1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58 package com.bea.xml.stream.reader;
59
60 import java.io.*;
61 import java.util.Hashtable;
62
63 /***
64 * This handles several XML-related tasks that normal java.io Readers
65 * don't support, inluding use of IETF standard encoding names and
66 * automatic detection of most XML encodings. The former is needed
67 * for interoperability; the latter is needed to conform with the XML
68 * spec. This class also optimizes reading some common encodings by
69 * providing low-overhead unsynchronized Reader support.
70 *
71 * <P> Note that the autodetection facility should be used only on
72 * data streams which have an unknown character encoding. For example,
73 * it should never be used on MIME text/xml entities.
74 *
75 * <P> Note that XML processors are only required to support UTF-8 and
76 * UTF-16 character encodings. Autodetection permits the underlying Java
77 * implementation to provide support for many other encodings, such as
78 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
79 *
80 * @author David Brownell
81 * @version $Revision: 1.1.1.1 $
82 */
83
84 final public class XmlReader extends Reader
85 {
86 private static final int MAXPUSHBACK = 512;
87
88 private Reader in;
89 private String assignedEncoding;
90 private boolean closed;
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108 /***
109 * Constructs the reader from an input stream, autodetecting
110 * the encoding to use according to the heuristic specified
111 * in the XML 1.0 recommendation.
112 *
113 * @param in the input stream from which the reader is constructed
114 * @exception IOException on error, such as unrecognized encoding
115 */
116 public static Reader createReader (InputStream in) throws IOException
117 {
118 return new XmlReader (in);
119 }
120
121 /***
122 * Creates a reader supporting the given encoding, mapping
123 * from standard encoding names to ones that understood by
124 * Java where necessary.
125 *
126 * @param in the input stream from which the reader is constructed
127 * @param encoding the IETF standard name of the encoding to use;
128 * if null, autodetection is used.
129 * @exception IOException on error, including unrecognized encoding
130 */
131 public static Reader createReader (InputStream in, String encoding)
132 throws IOException
133 {
134 if (encoding == null)
135 return new XmlReader (in);
136 if ("UTF-8".equalsIgnoreCase (encoding)
137 || "UTF8".equalsIgnoreCase (encoding))
138 return new Utf8Reader (in);
139 if ("US-ASCII".equalsIgnoreCase (encoding)
140 || "ASCII".equalsIgnoreCase (encoding))
141 return new AsciiReader (in);
142 if ("ISO-8859-1".equalsIgnoreCase (encoding)
143
144 )
145 return new Iso8859_1Reader (in);
146
147
148
149
150
151
152
153
154 return new InputStreamReader (in, std2java (encoding));
155 }
156
157
158
159
160
161
162 static private final Hashtable charsets = new Hashtable (31);
163
164 static {
165 charsets.put ("UTF-16", "Unicode");
166 charsets.put ("ISO-10646-UCS-2", "Unicode");
167
168
169
170 charsets.put ("EBCDIC-CP-US", "cp037");
171 charsets.put ("EBCDIC-CP-CA", "cp037");
172 charsets.put ("EBCDIC-CP-NL", "cp037");
173 charsets.put ("EBCDIC-CP-WT", "cp037");
174
175 charsets.put ("EBCDIC-CP-DK", "cp277");
176 charsets.put ("EBCDIC-CP-NO", "cp277");
177 charsets.put ("EBCDIC-CP-FI", "cp278");
178 charsets.put ("EBCDIC-CP-SE", "cp278");
179
180 charsets.put ("EBCDIC-CP-IT", "cp280");
181 charsets.put ("EBCDIC-CP-ES", "cp284");
182 charsets.put ("EBCDIC-CP-GB", "cp285");
183 charsets.put ("EBCDIC-CP-FR", "cp297");
184
185 charsets.put ("EBCDIC-CP-AR1", "cp420");
186 charsets.put ("EBCDIC-CP-HE", "cp424");
187 charsets.put ("EBCDIC-CP-BE", "cp500");
188 charsets.put ("EBCDIC-CP-CH", "cp500");
189
190 charsets.put ("EBCDIC-CP-ROECE", "cp870");
191 charsets.put ("EBCDIC-CP-YU", "cp870");
192 charsets.put ("EBCDIC-CP-IS", "cp871");
193 charsets.put ("EBCDIC-CP-AR2", "cp918");
194
195
196
197
198 }
199
200
201
202 private static String std2java (String encoding)
203 {
204 String temp = encoding.toUpperCase ();
205 temp = (String) charsets.get (temp);
206 return temp != null ? temp : encoding;
207 }
208
209 /*** Returns the standard name of the encoding in use */
210 public String getEncoding ()
211 {
212 return assignedEncoding;
213 }
214
215 private XmlReader (InputStream stream) throws IOException
216 {
217 super (stream);
218
219 PushbackInputStream pb;
220 byte buf [];
221 int len;
222
223
224
225
226 /***
227 * Commented out the above code to make sure it works when the
228 * document is accessed using http. URL connection in the code uses
229 * a PushbackInputStream with size 7 and when we try to push back
230 * MAX which default value is set to 512 we get and exception. So
231 * that's why we need to wrap the stream irrespective of what type
232 * of stream we start off with.
233 */
234 pb = new PushbackInputStream (stream, MAXPUSHBACK);
235
236
237
238
239
240 buf = new byte [4];
241 len = pb.read (buf);
242 if (len > 0)
243 pb.unread (buf, 0, len);
244
245 if (len == 4) switch (buf [0] & 0x0ff) {
246 case 0:
247
248 if (buf [1] == 0x3c && buf [2] == 0x00 && buf [3] == 0x3f) {
249 setEncoding (pb, "UnicodeBig");
250 return;
251 }
252
253 break;
254
255 case '<':
256 switch (buf [1] & 0x0ff) {
257
258
259
260 default:
261 break;
262
263
264 case 0x00:
265 if (buf [2] == 0x3f && buf [3] == 0x00) {
266 setEncoding (pb, "UnicodeLittle");
267 return;
268 }
269
270 break;
271
272
273 case '?':
274 if (buf [2] != 'x' || buf [3] != 'm')
275 break;
276
277
278
279
280 useEncodingDecl (pb, "UTF8");
281 return;
282 }
283 break;
284
285
286 case 0x4c:
287 if (buf [1] == 0x6f
288 && (0x0ff & buf [2]) == 0x0a7
289 && (0x0ff & buf [3]) == 0x094) {
290 useEncodingDecl (pb, "CP037");
291 return;
292 }
293
294 break;
295
296
297 case 0xfe:
298 if ((buf [1] & 0x0ff) != 0xff)
299 break;
300 setEncoding (pb, "UTF-16");
301 return;
302
303
304 case 0xff:
305 if ((buf [1] & 0x0ff) != 0xfe)
306 break;
307 setEncoding (pb, "UTF-16");
308 return;
309
310
311 default:
312 break;
313 }
314
315
316
317
318
319 setEncoding (pb, "UTF-8");
320 }
321
322
323
324
325
326
327
328
329
330
331
332 private void useEncodingDecl (PushbackInputStream pb, String encoding)
333 throws IOException
334 {
335 byte buffer [] = new byte [MAXPUSHBACK];
336 int len;
337 Reader r;
338 int c;
339
340
341
342
343
344
345
346 len = pb.read (buffer, 0, buffer.length);
347 pb.unread (buffer, 0, len);
348 r = new InputStreamReader (
349 new ByteArrayInputStream (buffer, 4, len),
350 encoding);
351
352
353
354
355
356 if ((c = r.read ()) != 'l') {
357 setEncoding (pb, "UTF-8");
358 return;
359 }
360
361
362
363
364
365
366
367
368
369
370 StringBuffer buf = new StringBuffer ();
371 StringBuffer keyBuf = null;
372 String key = null;
373 boolean sawEq = false;
374 char quoteChar = 0;
375 boolean sawQuestion = false;
376
377 XmlDecl:
378 for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
379 if ((c = r.read ()) == -1)
380 break;
381
382
383 if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
384 continue;
385
386
387 if (i == 0)
388 break;
389
390
391 if (c == '?')
392 sawQuestion = true;
393 else if (sawQuestion) {
394 if (c == '>')
395 break;
396 sawQuestion = false;
397 }
398
399
400 if (key == null || !sawEq) {
401 if (keyBuf == null) {
402 if (Character.isWhitespace ((char) c))
403 continue;
404 keyBuf = buf;
405 buf.setLength (0);
406 buf.append ((char)c);
407 sawEq = false;
408 } else if (Character.isWhitespace ((char) c)) {
409 key = keyBuf.toString ();
410 } else if (c == '=') {
411 if (key == null)
412 key = keyBuf.toString ();
413 sawEq = true;
414 keyBuf = null;
415 quoteChar = 0;
416 } else
417 keyBuf.append ((char)c);
418 continue;
419 }
420
421
422 if (Character.isWhitespace ((char) c))
423 continue;
424 if (c == '"' || c == '\'') {
425 if (quoteChar == 0) {
426 quoteChar = (char) c;
427 buf.setLength (0);
428 continue;
429 } else if (c == quoteChar) {
430 if ("encoding".equals (key)) {
431 assignedEncoding = buf.toString ();
432
433
434 for (i = 0; i < assignedEncoding.length(); i++) {
435 c = assignedEncoding.charAt (i);
436 if ((c >= 'A' && c <= 'Z')
437 || (c >= 'a' && c <= 'z'))
438 continue;
439 if (i == 0)
440 break XmlDecl;
441 if (i > 0 && (c == '-'
442 || (c >= '0' && c <= '9')
443 || c == '.' || c == '_'))
444 continue;
445
446 break XmlDecl;
447 }
448
449 setEncoding (pb, assignedEncoding);
450 return;
451
452 } else {
453 key = null;
454 continue;
455 }
456 }
457 }
458 buf.append ((char) c);
459 }
460
461 setEncoding (pb, "UTF-8");
462 }
463
464 private void setEncoding (InputStream stream, String encoding)
465 throws IOException
466 {
467 assignedEncoding = encoding;
468 in = createReader (stream, encoding);
469 }
470
471 /***
472 * Reads the number of characters read into the buffer, or -1 on EOF.
473 */
474 public int read (char buf [], int off, int len) throws IOException
475 {
476 int val;
477
478 if (closed)
479 return -1;
480 val = in.read (buf, off, len);
481 if (val == -1)
482 close ();
483 return val;
484 }
485
486 /***
487 * Reads a single character.
488 */
489 public int read () throws IOException
490 {
491 int val;
492
493 if (closed)
494 throw new IOException ("closed");
495 val = in.read ();
496 if (val == -1)
497 close ();
498 return val;
499 }
500
501 /***
502 * Returns true iff the reader supports mark/reset.
503 */
504 public boolean markSupported ()
505 {
506 return in == null ? false : in.markSupported ();
507 }
508
509 /***
510 * Sets a mark allowing a limited number of characters to
511 * be "peeked", by reading and then resetting.
512 * @param value how many characters may be "peeked".
513 */
514 public void mark (int value) throws IOException
515 {
516 if (in != null) in.mark (value);
517 }
518
519 /***
520 * Resets the current position to the last marked position.
521 */
522 public void reset () throws IOException
523 {
524 if (in != null) in.reset ();
525 }
526
527 /***
528 * Skips a specified number of characters.
529 */
530 public long skip (long value) throws IOException
531 {
532 return in == null ? 0 : in.skip (value);
533 }
534
535 /***
536 * Returns true iff input characters are known to be ready.
537 */
538 public boolean ready () throws IOException
539 {
540 return in == null ? false : in.ready ();
541 }
542
543 /***
544 * Closes the reader.
545 */
546 public void close () throws IOException
547 {
548 if (closed)
549 return;
550 in.close ();
551 in = null;
552 closed = true;
553 }
554
555
556
557
558
559
560
561
562
563 static abstract class BaseReader extends Reader
564 {
565 protected InputStream instream;
566 protected byte buffer [];
567 protected int start, finish;
568
569 BaseReader (InputStream stream)
570 {
571 super (stream);
572
573 instream = stream;
574 buffer = new byte [8192];
575
576 }
577
578 public boolean ready () throws IOException
579 {
580 return instream == null
581 || (finish - start) > 0
582 || instream.available () != 0;
583 }
584
585
586 public void close () throws IOException
587 {
588 if (instream != null) {
589 instream.close ();
590 start = finish = 0;
591 buffer = null;
592 instream = null;
593 }
594 }
595 }
596
597
598
599
600
601
602
603 static final class Utf8Reader extends BaseReader
604 {
605
606 private char nextChar;
607
608 Utf8Reader (InputStream stream)
609 {
610 super (stream);
611 }
612
613 public int read (char buf [], int offset, int len) throws IOException
614 {
615 int i = 0, c = 0;
616
617 if (len <= 0)
618 return 0;
619
620
621
622 if ((offset + len) > buf.length || offset < 0)
623 throw new ArrayIndexOutOfBoundsException ();
624
625
626 if (nextChar != 0) {
627 buf [offset + i++] = nextChar;
628 nextChar = 0;
629 }
630
631 while (i < len) {
632
633 if (finish <= start) {
634 if (instream == null) {
635 c = -1;
636 break;
637 }
638 start = 0;
639 finish = instream.read (buffer, 0, buffer.length);
640 if (finish <= 0) {
641 this.close ();
642 c = -1;
643 break;
644 }
645 }
646
647
648
649
650
651
652
653
654
655
656
657
658
659 c = buffer [start] & 0x0ff;
660 if ((c & 0x80) == 0x00) {
661
662 start++;
663 buf [offset + i++] = (char) c;
664 continue;
665 }
666
667
668
669
670
671 int off = start;
672
673 try {
674
675 if ((buffer [off] & 0x0E0) == 0x0C0) {
676 c = (buffer [off++] & 0x1f) << 6;
677 c += buffer [off++] & 0x3f;
678
679
680
681
682 } else if ((buffer [off] & 0x0F0) == 0x0E0) {
683 c = (buffer [off++] & 0x0f) << 12;
684 c += (buffer [off++] & 0x3f) << 6;
685 c += buffer [off++] & 0x3f;
686
687
688
689
690 } else if ((buffer [off] & 0x0f8) == 0x0F0) {
691 c = (buffer [off++] & 0x07) << 18;
692 c += (buffer [off++] & 0x3f) << 12;
693 c += (buffer [off++] & 0x3f) << 6;
694 c += buffer [off++] & 0x3f;
695
696
697
698
699 if (c > 0x0010ffff)
700 throw new CharConversionException (
701 "UTF-8 encoding of character 0x00"
702 + Integer.toHexString (c)
703 + " can't be converted to Unicode."
704 );
705
706 else if (c > 0xffff) {
707
708 c -= 0x10000;
709 nextChar = (char) (0xDC00 + (c & 0x03ff));
710 c = 0xD800 + (c >> 10);
711 }
712
713
714 } else
715 throw new CharConversionException (
716 "Unconvertible UTF-8 character"
717 + " beginning with 0x"
718 + Integer.toHexString (
719 buffer [start] & 0xff)
720 );
721
722 } catch (ArrayIndexOutOfBoundsException e) {
723
724 c = 0;
725 }
726
727
728
729
730
731
732
733 if (off > finish) {
734 System.arraycopy (buffer, start,
735 buffer, 0, finish - start);
736 finish -= start;
737 start = 0;
738 off = instream.read (buffer, finish,
739 buffer.length - finish);
740 if (off < 0) {
741 this.close ();
742 throw new CharConversionException (
743 "Partial UTF-8 char");
744 }
745 finish += off;
746 continue;
747 }
748
749
750
751
752 for (start++; start < off; start++) {
753 if ((buffer [start] & 0xC0) != 0x80) {
754 this.close ();
755 throw new CharConversionException (
756 "Malformed UTF-8 char -- "
757 + "is an XML encoding declaration missing?"
758 );
759 }
760 }
761
762
763
764
765 buf [offset + i++] = (char) c;
766 if (nextChar != 0 && i < len) {
767 buf [offset + i++] = nextChar;
768 nextChar = 0;
769 }
770 }
771 if (i > 0)
772 return i;
773 return (c == -1) ? -1 : 0;
774 }
775 }
776
777
778
779
780
781
782
783
784
785 static final class AsciiReader extends BaseReader
786 {
787 AsciiReader (InputStream in) { super (in); }
788
789 public int read (char buf [], int offset, int len) throws IOException
790 {
791 int i, c;
792
793 if (instream == null)
794 return -1;
795
796
797
798 if ((offset + len) > buf.length || offset < 0)
799 throw new ArrayIndexOutOfBoundsException ();
800
801 for (i = 0; i < len; i++) {
802 if (start >= finish) {
803 start = 0;
804 finish = instream.read (buffer, 0, buffer.length);
805 if (finish <= 0) {
806 if (finish <= 0)
807 this.close ();
808 break;
809 }
810 }
811 c = buffer [start++];
812 if ((c & 0x80) != 0)
813 throw new CharConversionException (
814 "Illegal ASCII character, 0x"
815 + Integer.toHexString (c & 0xff)
816 );
817 buf [offset + i] = (char) c;
818 }
819 if (i == 0 && finish <= 0)
820 return -1;
821 return i;
822 }
823 }
824
825 static final class Iso8859_1Reader extends BaseReader
826 {
827 Iso8859_1Reader (InputStream in) { super (in); }
828
829 public int read (char buf [], int offset, int len) throws IOException
830 {
831 int i;
832
833 if (instream == null)
834 return -1;
835
836
837
838 if ((offset + len) > buf.length || offset < 0)
839 throw new ArrayIndexOutOfBoundsException ();
840
841 for (i = 0; i < len; i++) {
842 if (start >= finish) {
843 start = 0;
844 finish = instream.read (buffer, 0, buffer.length);
845 if (finish <= 0) {
846 if (finish <= 0)
847 this.close ();
848 break;
849 }
850 }
851 buf [offset + i] = (char) (0x0ff & buffer [start++]);
852 }
853 if (i == 0 && finish <= 0)
854 return -1;
855 return i;
856 }
857 }
858 }