View Javadoc

1   /*
2    * $Id: XmlReader.java,v 1.1.1.1 2000/11/23 01:53:33 edwingo Exp $
3    *
4    * The Apache Software License, Version 1.1
5    *
6    *
7    * Copyright (c) 2000 The Apache Software Foundation.  All rights 
8    * reserved.
9    *
10   * Redistribution and use in source and binary forms, with or without
11   * modification, are permitted provided that the following conditions
12   * are met:
13   *
14   * 1. Redistributions of source code must retain the above copyright
15   *    notice, this list of conditions and the following disclaimer. 
16   *
17   * 2. Redistributions in binary form must reproduce the above copyright
18   *    notice, this list of conditions and the following disclaimer in
19   *    the documentation and/or other materials provided with the
20   *    distribution.
21   *
22   * 3. The end-user documentation included with the redistribution,
23   *    if any, must include the following acknowledgment:  
24   *       "This product includes software developed by the
25   *        Apache Software Foundation (http://www.apache.org/)."
26   *    Alternately, this acknowledgment may appear in the software itself,
27   *    if and wherever such third-party acknowledgments normally appear.
28   *
29   * 4. The names "Crimson" and "Apache Software Foundation" must
30   *    not be used to endorse or promote products derived from this
31   *    software without prior written permission. For written 
32   *    permission, please contact apache@apache.org.
33   *
34   * 5. Products derived from this software may not be called "Apache",
35   *    nor may "Apache" appear in their name, without prior written
36   *    permission of the Apache Software Foundation.
37   *
38   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49   * SUCH DAMAGE.
50   * ====================================================================
51   *
52   * This software consists of voluntary contributions made by many
53   * individuals on behalf of the Apache Software Foundation and was
54   * originally based on software copyright (c) 1999, Sun Microsystems, Inc., 
55   * http://www.sun.com.  For more information on the Apache Software 
56   * Foundation, please see <http://www.apache.org/>.
57   */
58  package com.bea.xml.stream.reader;
59  
60  import java.io.*;
61  import java.util.Hashtable;
62  
63  /***
64   * This handles several XML-related tasks that normal java.io Readers
65   * don't support, including use of IETF standard encoding names and
66   * automatic detection of most XML encodings.  The former is needed
67   * for interoperability; the latter is needed to conform with the XML
68   * spec.  This class also optimizes reading some common encodings by
69   * providing low-overhead unsynchronized Reader support.
70   *
71   * <P> Note that the autodetection facility should be used only on
72   * data streams which have an unknown character encoding.  For example,
73   * it should never be used on MIME text/xml entities.
74   *
75   * <P> Note that XML processors are only required to support UTF-8 and
76   * UTF-16 character encodings.  Autodetection permits the underlying Java
77   * implementation to provide support for many other encodings, such as
78   * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
79   *
80   * @author David Brownell
81   * @version $Revision: 1.1.1.1 $
82   */
83  
84  final public class XmlReader extends Reader
85  {
    // Size, in bytes, of the pushback buffer wrapped around the input
    // stream; also the size of the lookahead buffer (and the bound on the
    // scan loop) used when parsing the XML/text declaration.
86      private static final int MAXPUSHBACK = 512;
87  
    // Reader that all I/O is delegated to; assigned by setEncoding()
    // and set to null again by close().
88      private Reader	in;
    // Encoding name handed back by getEncoding(); written by
    // setEncoding() and while parsing the encoding declaration.
89      private String	assignedEncoding;
    // Set to true by close(); checked by both read() methods.
90      private boolean	closed;
91  
92      //
93      // This class always delegates I/O to a reader, which gets
94      // its data from the very beginning of the XML text.  It needs
95      // to use a pushback stream since (a) autodetection can read
96      // partial UTF-8 characters which need to be fully processed,
97      // (b) the "Unicode" readers swallow characters that they think
98      // are byte order marks, so tests fail if they don't see the
99      // real byte order mark.
100     //
101     // It's got to do this efficiently:  character I/O is solidly on the
102     // critical path.  (So keep buffer length over 2 Kbytes to avoid
103     // excess buffering. Many URL handlers stuff a BufferedInputStream
104     // between here and the real data source, and larger buffers keep
105     // that from slowing you down.)
106     //
107 
108     /***
109      * Constructs the reader from an input stream, autodetecting
110      * the encoding to use according to the heuristic specified
111      * in the XML 1.0 recommendation.
112      *
113      * @param in the input stream from which the reader is constructed
114      * @exception IOException on error, such as unrecognized encoding
115      */
116     public static Reader createReader (InputStream in) throws IOException
117     {
118 	return new XmlReader (in);
119     }
120 
121     /***
122      * Creates a reader supporting the given encoding, mapping
123      * from standard encoding names to ones that understood by
124      * Java where necessary.
125      *
126      * @param in the input stream from which the reader is constructed
127      * @param encoding the IETF standard name of the encoding to use;
128      *	if null, autodetection is used.
129      * @exception IOException on error, including unrecognized encoding
130      */
131     public static Reader createReader (InputStream in, String encoding)
132     throws IOException
133     {
134 	if (encoding == null)
135 	    return new XmlReader (in);
136 	if ("UTF-8".equalsIgnoreCase (encoding)
137 		|| "UTF8".equalsIgnoreCase (encoding))
138 	    return new Utf8Reader (in);
139 	if ("US-ASCII".equalsIgnoreCase (encoding)
140 		|| "ASCII".equalsIgnoreCase (encoding))
141 	    return new AsciiReader (in);
142 	if ("ISO-8859-1".equalsIgnoreCase (encoding)
143 		// plus numerous aliases ... 
144 		)
145 	    return new Iso8859_1Reader (in);
146 
147 	//
148 	// What we really want is an administerable resource mapping
149 	// encoding names/aliases to classnames.  For example a property
150 	// file resource, "readers/mapping.props", holding and a set
151 	// of readers in that (sub)package... defaulting to this call
152 	// only if no better choice is available.
153 	//
154 	return new InputStreamReader (in, std2java (encoding));
155     }
156 
157     //
158     // JDK doesn't know all of the standard encoding names, and
159     // in particular none of the EBCDIC ones IANA defines (and
160     // which IBM encourages).
161     //
162     static private final Hashtable charsets = new Hashtable (31);
163 
164     static {
165 	charsets.put ("UTF-16", "Unicode");
166 	charsets.put ("ISO-10646-UCS-2", "Unicode");
167 
168 	// NOTE: no support for ISO-10646-UCS-4 yet.
169 
170 	charsets.put ("EBCDIC-CP-US", "cp037");
171 	charsets.put ("EBCDIC-CP-CA", "cp037");
172 	charsets.put ("EBCDIC-CP-NL", "cp037");
173 	charsets.put ("EBCDIC-CP-WT", "cp037");
174 
175 	charsets.put ("EBCDIC-CP-DK", "cp277");
176 	charsets.put ("EBCDIC-CP-NO", "cp277");
177 	charsets.put ("EBCDIC-CP-FI", "cp278");
178 	charsets.put ("EBCDIC-CP-SE", "cp278");
179 
180 	charsets.put ("EBCDIC-CP-IT", "cp280");
181 	charsets.put ("EBCDIC-CP-ES", "cp284");
182 	charsets.put ("EBCDIC-CP-GB", "cp285");
183 	charsets.put ("EBCDIC-CP-FR", "cp297");
184 
185 	charsets.put ("EBCDIC-CP-AR1", "cp420");
186 	charsets.put ("EBCDIC-CP-HE", "cp424");
187 	charsets.put ("EBCDIC-CP-BE", "cp500");
188 	charsets.put ("EBCDIC-CP-CH", "cp500");
189 
190 	charsets.put ("EBCDIC-CP-ROECE", "cp870");
191 	charsets.put ("EBCDIC-CP-YU", "cp870");
192 	charsets.put ("EBCDIC-CP-IS", "cp871");
193 	charsets.put ("EBCDIC-CP-AR2", "cp918");
194 
195 	// IANA also defines two that JDK 1.2 doesn't handle:
196 	//	EBCDIC-CP-GR		--> CP423
197 	//	EBCDIC-CP-TR		--> CP905
198     }
199 
200     // returns an encoding name supported by JDK >= 1.1.6
201     // for some cases required by the XML spec
202     private static String std2java (String encoding)
203     {
204 	String temp = encoding.toUpperCase ();
205 	temp = (String) charsets.get (temp);
206 	return temp != null ? temp : encoding;
207     }
208 
209     /*** Returns the standard name of the encoding in use */
210     public String getEncoding ()
211     {
212 	return assignedEncoding;
213     }
214 
215     private XmlReader (InputStream stream) throws IOException
216     {
217 	super (stream);
218 
219 	PushbackInputStream	pb;
220         byte			buf [];
221 	int			len;
222 
223 	/*if (stream instanceof PushbackInputStream)
224 	    pb = (PushbackInputStream) stream;
225 	else*/
226 	/***
227 	 * Commented out the above code to make sure it works when the
228 	 * document is accessed using http. URL connection in the code uses
229 	 * a PushbackInputStream with size 7 and when we try to push back
230 	 * MAX which default value is set to 512 we get and exception. So
231 	 * that's why we need to wrap the stream irrespective of what type
232 	 * of stream we start off with.
233 	 */
234 	pb = new PushbackInputStream (stream, MAXPUSHBACK);
235 
236         //
237         // See if we can figure out the character encoding used
238         // in this file by peeking at the first few bytes.
239         //
240 	buf = new byte [4];
241 	len = pb.read (buf);
242 	if (len > 0)
243 	    pb.unread (buf, 0, len);
244 
245 	if (len == 4) switch (buf [0] & 0x0ff) {
246             case 0:
247               // 00 3c 00 3f == illegal UTF-16 big-endian
248               if (buf [1] == 0x3c && buf [2] == 0x00 && buf [3] == 0x3f) {
249 		  setEncoding (pb, "UnicodeBig");
250                   return;
251               }
252 	      // else it's probably UCS-4
253 	      break;
254 
255             case '<':      // 0x3c: the most common cases!
256               switch (buf [1] & 0x0ff) {
257                 // First character is '<'; could be XML without
258 		// an XML directive such as "<hello>", "<!-- ...",
259 		// and so on.
260                 default:
261                   break;
262 
263                 // 3c 00 3f 00 == illegal UTF-16 little endian
264                 case 0x00:
265                   if (buf [2] == 0x3f && buf [3] == 0x00) {
266 		      setEncoding (pb, "UnicodeLittle");
267 		      return;
268                   }
269 		  // else probably UCS-4
270 		  break;
271 
272                 // 3c 3f 78 6d == ASCII and supersets '<?xm'
273                 case '?': 
274                   if (buf [2] != 'x' || buf [3] != 'm')
275 		      break;
276 		  //
277 		  // One of several encodings could be used:
278                   // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
279 		  //
280 		  useEncodingDecl (pb, "UTF8");
281                   return;
282               }
283 	      break;
284 
285             // 4c 6f a7 94 ... some EBCDIC code page
286             case 0x4c:
287               if (buf [1] == 0x6f
288 		    && (0x0ff & buf [2]) == 0x0a7
289 		    && (0x0ff & buf [3]) == 0x094) {
290 		  useEncodingDecl (pb, "CP037");
291 		  return;
292 	      }
293 	      // whoops, treat as UTF-8
294 	      break;
295 
296             // UTF-16 big-endian
297             case 0xfe:
298               if ((buf [1] & 0x0ff) != 0xff)
299                   break;
300 	      setEncoding (pb, "UTF-16");
301               return;
302 
303             // UTF-16 little-endian
304             case 0xff:
305               if ((buf [1] & 0x0ff) != 0xfe)
306                   break;
307 	      setEncoding (pb, "UTF-16");
308 	      return;
309 
310             // default ... no XML declaration
311             default:
312               break;
313         }
314 
315 	//
316         // If all else fails, assume XML without a declaration, and
317         // using UTF-8 encoding.
318 	//
319 	setEncoding (pb, "UTF-8");
320     }
321 
322     /*
323      * Read the encoding decl on the stream, knowing that it should
324      * be readable using the specified encoding (basically, ASCII or
325      * EBCDIC).  The body of the document may use a wider range of
326      * characters than the XML/Text decl itself, so we switch to use
327      * the specified encoding as soon as we can.  (ASCII is a subset
328      * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
329      * has a variety of "code pages" that have these characters as
330      * a common subset.)
331      */
332     private void useEncodingDecl (PushbackInputStream pb, String encoding)
333     throws IOException
334     {
335 	byte			buffer [] = new byte [MAXPUSHBACK];
336 	int			len;
337 	Reader			r;
338 	int			c;
339 
340 	//
341 	// Buffer up a bunch of input, and set up to read it in
342 	// the specified encoding ... we can skip the first four
343 	// bytes since we know that "<?xm" was read to determine
344 	// what encoding to use!
345 	//
346 	len = pb.read (buffer, 0, buffer.length);
347 	pb.unread (buffer, 0, len);
348 	r = new InputStreamReader (
349 		new ByteArrayInputStream (buffer, 4, len),
350 		encoding);
351 
352 	//
353 	// Next must be "l" (and whitespace) else we conclude
354 	// error and choose UTF-8.
355 	//
356 	if ((c = r.read ()) != 'l') {
357 	    setEncoding (pb, "UTF-8");
358 	    return;
359 	}
360 
361 	//
362 	// Then, we'll skip any
363 	// 	S version="..." 	[or single quotes]
364 	// bit and get any subsequent 
365 	// 	S encoding="..." 	[or single quotes]
366 	//
367 	// We put an arbitrary size limit on how far we read; lots
368 	// of space will break this algorithm.
369 	//
370 	StringBuffer	buf = new StringBuffer ();
371 	StringBuffer	keyBuf = null;
372 	String		key = null;
373 	boolean		sawEq = false;
374 	char		quoteChar = 0;
375 	boolean		sawQuestion = false;
376 
377     XmlDecl:
378 	for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
379 	    if ((c = r.read ()) == -1)
380 		break;
381 
382 	    // ignore whitespace before/between "key = 'value'"
383 	    if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
384 		continue;
385 
386 	    // ... but require at least a little!
387 	    if (i == 0)
388 		break;
389 	    
390 	    // terminate the loop ASAP
391 	    if (c == '?')
392 		sawQuestion = true;
393 	    else if (sawQuestion) {
394 		if (c == '>')
395 		    break;
396 		sawQuestion = false;
397 	    }
398 	    
399 	    // did we get the "key =" bit yet?
400 	    if (key == null || !sawEq) {
401 		if (keyBuf == null) {
402 		    if (Character.isWhitespace ((char) c))
403 			continue;
404 		    keyBuf = buf;
405 		    buf.setLength (0);
406 		    buf.append ((char)c);
407 		    sawEq = false;
408 		} else if (Character.isWhitespace ((char) c)) {
409 		    key = keyBuf.toString ();
410 		} else if (c == '=') {
411 		    if (key == null)
412 			key = keyBuf.toString ();
413 		    sawEq = true;
414 		    keyBuf = null;
415 		    quoteChar = 0;
416 		} else
417 		    keyBuf.append ((char)c);
418 		continue;
419 	    }
420 
421 	    // space before quoted value
422 	    if (Character.isWhitespace ((char) c))
423 		continue;
424 	    if (c == '"' || c == '\'') {
425 		if (quoteChar == 0) {
426 		    quoteChar = (char) c;
427 		    buf.setLength (0);
428 		    continue;
429 		} else if (c == quoteChar) {
430 		    if ("encoding".equals (key)) {
431 			assignedEncoding = buf.toString ();
432 
433 			// [81] Encname ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
434 			for (i = 0; i < assignedEncoding.length(); i++) {
435 			    c = assignedEncoding.charAt (i);
436 			    if ((c >= 'A' && c <= 'Z')
437 				    || (c >= 'a' && c <= 'z'))
438 				continue;
439 			    if (i == 0)
440 				break XmlDecl;
441 			    if (i > 0 && (c == '-'
442 				    || (c >= '0' && c <= '9')
443 				    || c == '.' || c == '_'))
444 				continue;
445 			    // map illegal names to UTF-8 default
446 			    break XmlDecl;
447 			}
448 
449 			setEncoding (pb, assignedEncoding);
450 			return;
451 
452 		    } else {
453 			key = null;
454 			continue;
455 		    }
456 		}
457 	    }
458 	    buf.append ((char) c);
459 	}
460 
461 	setEncoding (pb, "UTF-8");
462     }
463 
464     private void setEncoding (InputStream stream, String encoding)
465     throws IOException
466     {
467 	assignedEncoding = encoding;
468 	in = createReader (stream, encoding);
469     }
470 
471     /***
472      * Reads the number of characters read into the buffer, or -1 on EOF.
473      */
474     public int read (char buf [], int off, int len) throws IOException
475     {
476 	int	val;
477 
478 	if (closed)
479 	    return -1;		// throw new IOException ("closed");
480 	val = in.read (buf, off, len);
481 	if (val == -1)
482 	    close ();
483 	return val;
484     }
485 
486     /***
487      * Reads a single character.
488      */
489     public int read () throws IOException
490     {
491 	int	val;
492 
493 	if (closed)
494 	    throw new IOException ("closed");
495 	val = in.read ();
496 	if (val == -1)
497 	    close ();
498 	return val;
499     }
500 
501     /***
502      * Returns true iff the reader supports mark/reset.
503      */
504     public boolean markSupported ()
505     {
506 	return in == null ? false : in.markSupported ();
507     }
508 
509     /***
510      * Sets a mark allowing a limited number of characters to
511      * be "peeked", by reading and then resetting.
512      * @param value how many characters may be "peeked".
513      */
514     public void mark (int value) throws IOException
515     {
516 	if (in != null) in.mark (value);
517     }
518 
519     /***
520      * Resets the current position to the last marked position.
521      */
522     public void reset () throws IOException
523     {
524 	if (in != null) in.reset ();
525     }
526 
527     /***
528      * Skips a specified number of characters.
529      */
530     public long skip (long value) throws IOException
531     {
532 	return in == null ? 0 : in.skip (value);
533     }
534 
535     /***
536      * Returns true iff input characters are known to be ready.
537      */
538     public boolean ready () throws IOException
539     {
540 	return in == null ? false : in.ready ();
541     }
542 
543     /***
544      * Closes the reader.
545      */
546     public void close () throws IOException
547     {
548 	if (closed)
549 	    return;
550 	in.close ();
551 	in = null;
552 	closed = true;
553     }
554 
555     //
556     // Delegating to a converter module will always be slower than
557     // direct conversion.  Use a similar approach for any other
558     // readers that need to be particularly fast; only block I/O
559     // speed matters to this package.  For UTF-16, separate readers
560     // for big and little endian streams make a difference, too;
561     // fewer conditionals in the critical path!
562     //
563     static abstract class BaseReader extends Reader
564     {
565 	protected InputStream	instream;
566 	protected byte		buffer [];
567 	protected int		start, finish;
568         
569 	BaseReader (InputStream stream)
570 	{
571 	    super (stream);
572 
573 	    instream = stream;
574             buffer = new byte [8192];
575 
576 	}
577 
578 	public boolean ready () throws IOException
579 	{
580 	    return instream == null
581 		|| (finish - start) > 0
582 		||  instream.available () != 0;
583 	}
584 
585 	// caller shouldn't read again
586 	public void close () throws IOException
587 	{
588 	    if (instream != null) {
589 		instream.close ();
590 		start = finish = 0;
591 		buffer = null;
592 		instream = null;
593 	    }
594 	}
595     }
596 
597     //
598     // We want this reader, to make the default encoding be as fast
599     // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
600     // InputStreamReader works, but 20+% slower speed isn't OK for
601     // the default/primary encoding.
602     //
603     static final class Utf8Reader extends BaseReader
604     {
605 	// 2nd half of UTF-8 surrogate pair
606 	private char		nextChar;
607 
608 	Utf8Reader (InputStream stream)
609 	{
610 	    super (stream);
611 	}
612 
613 	public int read (char buf [], int offset, int len) throws IOException
614 	{
615 	    int i = 0, c = 0;
616 
617 	    if (len <= 0)
618 		return 0;
619 	 
620 	    // avoid many runtime bounds checks ... a good optimizer
621             // (static or JIT) will now remove checks from the loop.
622             if ((offset + len) > buf.length || offset < 0)
623                 throw new ArrayIndexOutOfBoundsException ();
624 
625 	    // Consume remaining half of any surrogate pair immediately
626 	    if (nextChar != 0) {
627 		buf [offset + i++] = nextChar;
628 		nextChar = 0;
629 	    }
630 
631 	    while (i < len) {
632 		// stop or read data if needed
633 		if (finish <= start) {
634 		    if (instream == null) {
635 			c = -1;
636 			break;
637 		    }
638 		    start = 0;
639 		    finish = instream.read (buffer, 0, buffer.length);
640 		    if (finish <= 0) {
641 			this.close ();
642 			c = -1;
643 			break;
644 		    }
645 		}
646 		
647 		//
648 		// RFC 2279 describes UTF-8; there are six encodings.
649 		// Each encoding takes a fixed number of characters
650 		// (1-6 bytes) and is flagged by a bit pattern in the
651 		// first byte.  The five and six byte-per-character
652 		// encodings address characters which are disallowed
653 		// in XML documents, as do some four byte ones.
654 		// 
655 
656 		//
657 		// Single byte == ASCII.  Common; optimize.
658 		//
659 		c = buffer [start] & 0x0ff;
660 		if ((c & 0x80) == 0x00) {
661 		    // 0x0000 <= c <= 0x007f
662 		    start++;
663 		    buf [offset + i++] = (char) c;
664 		    continue;
665 		}
666 		
667 		//
668 		// Multibyte chars -- check offsets optimistically,
669 		// ditto the "10xx xxxx" format for subsequent bytes
670 		//
671 		int		off = start;
672 		
673 		try {
674 		    // 2 bytes
675 		    if ((buffer [off] & 0x0E0) == 0x0C0) {
676 			c  = (buffer [off++] & 0x1f) << 6;
677 			c +=  buffer [off++] & 0x3f;
678 
679 			// 0x0080 <= c <= 0x07ff
680 
681 		    // 3 bytes
682 		    } else if ((buffer [off] & 0x0F0) == 0x0E0) {
683 			c  = (buffer [off++] & 0x0f) << 12;
684 			c += (buffer [off++] & 0x3f) << 6;
685 			c +=  buffer [off++] & 0x3f;
686 
687 			// 0x0800 <= c <= 0xffff
688 
689 		    // 4 bytes
690 		    } else if ((buffer [off] & 0x0f8) == 0x0F0) {
691 			c  = (buffer [off++] & 0x07) << 18;
692 			c += (buffer [off++] & 0x3f) << 12;
693 			c += (buffer [off++] & 0x3f) << 6;
694 			c +=  buffer [off++] & 0x3f;
695 
696 			// 0x0001 0000  <= c  <= 0x001f ffff
697 
698 			// Unicode supports c <= 0x0010 ffff ...
699 			if (c > 0x0010ffff)
700 			    throw new CharConversionException (
701 				"UTF-8 encoding of character 0x00"
702 				+ Integer.toHexString (c)
703 				+ " can't be converted to Unicode."
704 				);
705 
706 			else if (c > 0xffff) {
707 			    // Convert UCS-4 char to surrogate pair (UTF-16)
708 			    c -= 0x10000;
709 			    nextChar = (char) (0xDC00 + (c & 0x03ff));
710 			    c = 0xD800 + (c >> 10);
711 			}
712 		        // 5 and 6 byte versions are XML WF errors, but
713 		        // typically come from mislabeled encodings
714 		    } else
715 			throw new CharConversionException (
716 			    "Unconvertible UTF-8 character"
717 			    + " beginning with 0x"
718 			    + Integer.toHexString (
719 				buffer [start] & 0xff)
720 			);
721 
722 		} catch (ArrayIndexOutOfBoundsException e) {
723 		    // off > length && length >= buffer.length
724 		    c = 0;
725 		}
726 
727 		//
728 		// if the buffer held only a partial character,
729 		// compact it and try to read the rest of the
730 		// character.  worst case involves three
731 		// single-byte reads -- quite rare.
732 		//
733 		if (off > finish) {
734 		    System.arraycopy (buffer, start,
735 			    buffer, 0, finish - start);
736 		    finish -= start;
737 		    start = 0;
738 		    off = instream.read (buffer, finish,
739 			    buffer.length - finish);
740 		    if (off < 0) {
741 			this.close ();
742 			throw new CharConversionException (
743 			    "Partial UTF-8 char");
744 		    }
745 		    finish += off;
746 		    continue;
747 		}
748 
749 		//
750 		// check the format of the non-initial bytes
751 		//
752 		for (start++; start < off; start++) {
753 		    if ((buffer [start] & 0xC0) != 0x80) {
754 			this.close ();
755 			throw new CharConversionException (
756 			    "Malformed UTF-8 char -- "
757 			    + "is an XML encoding declaration missing?"
758 			    );
759 		    }
760 		}
761 
762 		//
763 		// If this needed a surrogate pair, consume ASAP
764 		//
765 		buf [offset + i++] = (char) c;
766 		if (nextChar != 0 && i < len) {
767 		    buf [offset + i++] = nextChar;
768 		    nextChar = 0;
769 		}
770 	    }
771 	    if (i > 0)
772 		return i;
773 	    return (c == -1) ? -1 : 0;
774 	}
775     }
776 
777     //
778     // We want ASCII and ISO-8859 Readers since they're the most common
779     // encodings in the US and Europe, and we don't want performance
780     // regressions for them.  They're also easy to implement efficiently,
781     // since they're bitmask subsets of UNICODE.
782     //
783     // XXX haven't benchmarked these readers vs what we get out of JDK.
784     //
785     static final class AsciiReader extends BaseReader
786     {
787 	AsciiReader (InputStream in) { super (in); }
788 
789 	public int read (char buf [], int offset, int len) throws IOException
790 	{
791 	    int		i, c;
792 
793 	    if (instream == null)
794 		return -1;
795 
796 	    // avoid many runtime bounds checks ... a good optimizer
797             // (static or JIT) will now remove checks from the loop.
798             if ((offset + len) > buf.length || offset < 0)
799                 throw new ArrayIndexOutOfBoundsException ();
800 
801 	    for (i = 0; i < len; i++) {
802 		if (start >= finish) {
803 		    start = 0;
804 		    finish = instream.read (buffer, 0, buffer.length);
805 		    if (finish <= 0) {
806 			if (finish <= 0)
807 			    this.close ();
808 			break;
809 		    }
810 		}
811 		c = buffer [start++];
812 		if ((c & 0x80) != 0)
813 		    throw new CharConversionException (
814 			"Illegal ASCII character, 0x"
815 			+ Integer.toHexString (c & 0xff)
816 		    );
817 		buf [offset + i] = (char) c;
818 	    }
819 	    if (i == 0 && finish <= 0)
820 		return -1;
821 	    return i;
822 	}
823     }
824 
825     static final class Iso8859_1Reader extends BaseReader
826     {
827 	Iso8859_1Reader (InputStream in) { super (in); }
828 
829 	public int read (char buf [], int offset, int len) throws IOException
830 	{
831 	    int		i;
832 
833 	    if (instream == null)
834 		return -1;
835 
836 	    // avoid many runtime bounds checks ... a good optimizer
837             // (static or JIT) will now remove checks from the loop.
838             if ((offset + len) > buf.length || offset < 0)
839                 throw new ArrayIndexOutOfBoundsException ();
840 
841 	    for (i = 0; i < len; i++) {
842 		if (start >= finish) {
843 		    start = 0;
844 		    finish = instream.read (buffer, 0, buffer.length);
845 		    if (finish <= 0) {
846 			if (finish <= 0)
847 			    this.close ();
848 			break;
849 		    }
850 		}
851 		buf [offset + i] = (char) (0x0ff & buffer [start++]);
852 	    }
853 	    if (i == 0 && finish <= 0)
854 		return -1;
855 	    return i;
856 	}
857     }
858 }