/*
 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

import java.io.ByteArrayInputStream;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.io.Reader;
import java.util.Hashtable;


// NOTE: Add I18N support to this class when JDK gets the ability to
// defer selection of locale for exception messages ... use the same
// technique for both.


/**
 * This handles several XML-related tasks that normal java.io Readers
 * don't support, including use of IETF standard encoding names and
 * automatic detection of most XML encodings.  The former is needed
 * for interoperability; the latter is needed to conform with the XML
 * spec.  This class also optimizes reading some common encodings by
 * providing low-overhead unsynchronized Reader support.
 * <p/>
 * <P> Note that the autodetection facility should be used only on
 * data streams which have an unknown character encoding.  For example,
 * it should never be used on MIME text/xml entities.
 * <p/>
 * <P> Note that XML processors are only required to support UTF-8 and
 * UTF-16 character encodings.  Autodetection permits the underlying Java
 * implementation to provide support for many other encodings, such as
 * US-ASCII, ISO-8859-5, Shift_JIS, EUC-JP, and ISO-2022-JP.
 *
 * @author David Brownell
 * @author Janet Koenig
 * @version 1.3 00/02/24
 */
// package private
final class XmlReader extends Reader {
    private static final int MAXPUSHBACK = 512;

    private Reader in;
    private String assignedEncoding;
    private boolean closed;

    //
    // This class always delegates I/O to a reader, which gets
    // its data from the very beginning of the XML text.  It needs
    // to use a pushback stream since (a) autodetection can read
    // partial UTF-8 characters which need to be fully processed,
    // (b) the "Unicode" readers swallow characters that they think
    // are byte order marks, so tests fail if they don't see the
    // real byte order mark.
    //
    // It's got to do this efficiently:  character I/O is solidly on the
    // critical path.  (So keep buffer length over 2 Kbytes to avoid
    // excess buffering.  Many URL handlers stuff a BufferedInputStream
    // between here and the real data source, and larger buffers keep
    // that from slowing you down.)
    //

    /**
     * Constructs the reader from an input stream, autodetecting
     * the encoding to use according to the heuristic specified
     * in the XML 1.0 recommendation.
     *
     * @param in the input stream from which the reader is constructed
     * @throws IOException on error, such as unrecognized encoding
     */
    public static Reader createReader(InputStream in) throws IOException {
        return new XmlReader(in);
    }
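
    // Usage sketch (illustrative only; "rawStream" is a made-up name, and since
    // this class is package private the cast only works for callers inside this
    // package):  hand an undecoded byte stream to the factory above, then ask
    // the resulting reader which encoding was detected.
    //
    //     InputStream rawStream = ...;                    // caller-supplied bytes
    //     Reader r = XmlReader.createReader(rawStream);   // autodetects the encoding
    //     String enc = ((XmlReader) r).getEncoding();     // e.g. "UTF-8" or "UTF-16"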

    /**
     * Creates a reader supporting the given encoding, mapping
     * from standard encoding names to ones understood by
     * Java where necessary.
     *
     * @param in the input stream from which the reader is constructed
     * @param encoding the IETF standard name of the encoding to use;
     *                 if null, autodetection is used.
     * @throws IOException on error, including unrecognized encoding
     */
    public static Reader createReader(InputStream in, String encoding)
            throws IOException {
        if (encoding == null)
            return new XmlReader(in);
        if ("UTF-8".equalsIgnoreCase(encoding)
                || "UTF8".equalsIgnoreCase(encoding))
            return new Utf8Reader(in);
        if ("US-ASCII".equalsIgnoreCase(encoding)
                || "ASCII".equalsIgnoreCase(encoding))
            return new AsciiReader(in);
        if ("ISO-8859-1".equalsIgnoreCase(encoding)
            // plus numerous aliases ...
        )
            return new Iso8859_1Reader(in);

        //
        // What we really want is an administerable resource mapping
        // encoding names/aliases to classnames.  For example a property
        // file resource, "readers/mapping.props", holding the mapping,
        // and a set of readers in that (sub)package ... defaulting to
        // this call only if no better choice is available.
        //
        return new InputStreamReader(in, std2java(encoding));
    }
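
    // Selection summary for the factory above (each line restates a check the
    // code actually makes; "in" is just the parameter name):
    //
    //     createReader(in, null)          -> autodetecting XmlReader
    //     createReader(in, "utf-8")       -> Utf8Reader        (checks are case-insensitive)
    //     createReader(in, "US-ASCII")    -> AsciiReader
    //     createReader(in, "ISO-8859-1")  -> Iso8859_1Reader
    //     createReader(in, anything else) -> InputStreamReader over std2java(encoding)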

    //
    // JDK doesn't know all of the standard encoding names, and
    // in particular none of the EBCDIC ones IANA defines (and
    // which IBM encourages).
    //
    static private final Hashtable charsets = new Hashtable(31);

    static {
        charsets.put("UTF-16", "Unicode");
        charsets.put("ISO-10646-UCS-2", "Unicode");

        // NOTE: no support for ISO-10646-UCS-4 yet.

        charsets.put("EBCDIC-CP-US", "cp037");
        charsets.put("EBCDIC-CP-CA", "cp037");
        charsets.put("EBCDIC-CP-NL", "cp037");
        charsets.put("EBCDIC-CP-WT", "cp037");

        charsets.put("EBCDIC-CP-DK", "cp277");
        charsets.put("EBCDIC-CP-NO", "cp277");
        charsets.put("EBCDIC-CP-FI", "cp278");
        charsets.put("EBCDIC-CP-SE", "cp278");

        charsets.put("EBCDIC-CP-IT", "cp280");
        charsets.put("EBCDIC-CP-ES", "cp284");
        charsets.put("EBCDIC-CP-GB", "cp285");
        charsets.put("EBCDIC-CP-FR", "cp297");

        charsets.put("EBCDIC-CP-AR1", "cp420");
        charsets.put("EBCDIC-CP-HE", "cp424");
        charsets.put("EBCDIC-CP-BE", "cp500");
        charsets.put("EBCDIC-CP-CH", "cp500");

        charsets.put("EBCDIC-CP-ROECE", "cp870");
        charsets.put("EBCDIC-CP-YU", "cp870");
        charsets.put("EBCDIC-CP-IS", "cp871");
        charsets.put("EBCDIC-CP-AR2", "cp918");

        // IANA also defines two that JDK 1.2 doesn't handle:
        //   EBCDIC-CP-GR  --> CP423
        //   EBCDIC-CP-TR  --> CP905
    }

    // returns an encoding name supported by JDK >= 1.1.6
    // for some cases required by the XML spec
    private static String std2java(String encoding) {
        String temp = encoding.toUpperCase();
        temp = (String) charsets.get(temp);
        return temp != null ? temp : encoding;
    }
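
    // For example (values straight from the table above):
    //     std2java("EBCDIC-CP-US") -> "cp037"
    //     std2java("utf-16")       -> "Unicode"   (lookup is on the upper-cased name)
    //     std2java("Shift_JIS")    -> "Shift_JIS" (unknown names pass through unchanged)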

    /**
     * Returns the standard name of the encoding in use.
     */
    public String getEncoding() {
        return assignedEncoding;
    }

    private XmlReader(InputStream stream) throws IOException {
        super(stream);

        PushbackInputStream pb;
        byte buf [];
        int len;

        if (stream instanceof PushbackInputStream)
            pb = (PushbackInputStream) stream;
        else
            pb = new PushbackInputStream(stream, MAXPUSHBACK);

        //
        // See if we can figure out the character encoding used
        // in this file by peeking at the first few bytes.
        //
        buf = new byte[4];
        len = pb.read(buf);
        if (len > 0)
            pb.unread(buf, 0, len);

        if (len == 4)
            switch (buf[0] & 0x0ff) {
            case 0:
                // 00 3c 00 3f == illegal UTF-16 big-endian
                if (buf[1] == 0x3c && buf[2] == 0x00 && buf[3] == 0x3f) {
                    setEncoding(pb, "UnicodeBig");
                    return;
                }
                // else it's probably UCS-4
                break;

            case '<':      // 0x3c: the most common cases!
                switch (buf[1] & 0x0ff) {
                // First character is '<'; could be XML without
                // an XML directive such as "<hello>", "<!-- ...",
                // and so on.
                default:
                    break;

                // 3c 00 3f 00 == illegal UTF-16 little endian
                case 0x00:
                    if (buf[2] == 0x3f && buf[3] == 0x00) {
                        setEncoding(pb, "UnicodeLittle");
                        return;
                    }
                    // else probably UCS-4
                    break;

                // 3c 3f 78 6d == ASCII and supersets '<?xm'
                case '?':
                    if (buf[2] != 'x' || buf[3] != 'm')
                        break;
                    //
                    // One of several encodings could be used:
                    // Shift-JIS, ASCII, UTF-8, ISO-8859-*, etc
                    //
                    useEncodingDecl(pb, "UTF8");
                    return;
                }
                break;

            // 4c 6f a7 94 ... some EBCDIC code page
            case 0x4c:
                if (buf[1] == 0x6f
                        && (0x0ff & buf[2]) == 0x0a7
                        && (0x0ff & buf[3]) == 0x094) {
                    useEncodingDecl(pb, "CP037");
                    return;
                }
                // whoops, treat as UTF-8
                break;

            // UTF-16 big-endian
            case 0xfe:
                if ((buf[1] & 0x0ff) != 0xff)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // UTF-16 little-endian
            case 0xff:
                if ((buf[1] & 0x0ff) != 0xfe)
                    break;
                setEncoding(pb, "UTF-16");
                return;

            // default ... no XML declaration
            default:
                break;
            }

        //
        // If all else fails, assume XML without a declaration, and
        // using UTF-8 encoding.
        //
        setEncoding(pb, "UTF-8");
    }
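
    // Summary of the detection above (first bytes in hex -> action taken):
    //     FE FF .. ..   -> setEncoding "UTF-16"         (big-endian byte order mark)
    //     FF FE .. ..   -> setEncoding "UTF-16"         (little-endian byte order mark)
    //     00 3C 00 3F   -> setEncoding "UnicodeBig"     (UTF-16BE, no byte order mark)
    //     3C 00 3F 00   -> setEncoding "UnicodeLittle"  (UTF-16LE, no byte order mark)
    //     3C 3F 78 6D   -> useEncodingDecl in "UTF8"    ("<?xm" in an ASCII superset)
    //     4C 6F A7 94   -> useEncodingDecl in "CP037"   ("<?xm" in an EBCDIC code page)
    //     anything else -> setEncoding "UTF-8"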

    /*
     * Read the encoding decl on the stream, knowing that it should
     * be readable using the specified encoding (basically, ASCII or
     * EBCDIC).  The body of the document may use a wider range of
     * characters than the XML/Text decl itself, so we switch to use
     * the specified encoding as soon as we can.  (ASCII is a subset
     * of UTF-8, ISO-8859-*, ISO-2022-JP, EUC-JP, and more; EBCDIC
     * has a variety of "code pages" that have these characters as
     * a common subset.)
     */
    private void useEncodingDecl(PushbackInputStream pb, String encoding)
            throws IOException {
        byte buffer [] = new byte[MAXPUSHBACK];
        int len;
        Reader r;
        int c;

        //
        // Buffer up a bunch of input, and set up to read it in
        // the specified encoding ... we can skip the first four
        // bytes since we know that "<?xm" was read to determine
        // what encoding to use!
        //
        len = pb.read(buffer, 0, buffer.length);
        pb.unread(buffer, 0, len);
        r = new InputStreamReader(new ByteArrayInputStream(buffer, 4, len),
                encoding);

        //
        // Next must be "l" (and whitespace) else we conclude
        // error and choose UTF-8.
        //
        if ((c = r.read()) != 'l') {
            setEncoding(pb, "UTF-8");
            return;
        }

        //
        // Then, we'll skip any
        //     S version="..."     [or single quotes]
        // bit and get any subsequent
        //     S encoding="..."    [or single quotes]
        //
        // We put an arbitrary size limit on how far we read; lots
        // of space will break this algorithm.
        //
        StringBuffer buf = new StringBuffer();
        StringBuffer keyBuf = null;
        String key = null;
        boolean sawEq = false;
        char quoteChar = 0;
        boolean sawQuestion = false;

        XmlDecl:
        for (int i = 0; i < MAXPUSHBACK - 5; ++i) {
            if ((c = r.read()) == -1)
                break;

            // ignore whitespace before/between "key = 'value'"
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                continue;

            // ... but require at least a little!
            if (i == 0)
                break;

            // terminate the loop ASAP
            if (c == '?')
                sawQuestion = true;
            else if (sawQuestion) {
                if (c == '>')
                    break;
                sawQuestion = false;
            }

            // did we get the "key =" bit yet?
            if (key == null || !sawEq) {
                if (keyBuf == null) {
                    if (Character.isWhitespace((char) c))
                        continue;
                    keyBuf = buf;
                    buf.setLength(0);
                    buf.append((char) c);
                    sawEq = false;
                } else if (Character.isWhitespace((char) c)) {
                    key = keyBuf.toString();
                } else if (c == '=') {
                    if (key == null)
                        key = keyBuf.toString();
                    sawEq = true;
                    keyBuf = null;
                    quoteChar = 0;
                } else
                    keyBuf.append((char) c);
                continue;
            }

            // space before quoted value
            if (Character.isWhitespace((char) c))
                continue;
            if (c == '"' || c == '\'') {
                if (quoteChar == 0) {
                    quoteChar = (char) c;
                    buf.setLength(0);
                    continue;
                } else if (c == quoteChar) {
                    if ("encoding".equals(key)) {
                        assignedEncoding = buf.toString();

                        // [81] EncName ::= [A-Za-z] ([A-Za-z0-9._]|'-')*
                        for (i = 0; i < assignedEncoding.length(); i++) {
                            c = assignedEncoding.charAt(i);
                            if ((c >= 'A' && c <= 'Z')
                                    || (c >= 'a' && c <= 'z'))
                                continue;
                            if (i == 0)
                                break XmlDecl;
                            if (i > 0 && (c == '-'
                                    || (c >= '0' && c <= '9')
                                    || c == '.' || c == '_'))
                                continue;
                            // map illegal names to UTF-8 default
                            break XmlDecl;
                        }

                        setEncoding(pb, assignedEncoding);
                        return;

                    } else {
                        key = null;
                        continue;
                    }
                }
            }
            buf.append((char) c);
        }

        setEncoding(pb, "UTF-8");
    }
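
    // Worked example for the loop above: given input beginning
    //     <?xml version='1.0' encoding='ISO-8859-1'?>
    // the caller has already consumed "<?xm" and this method has read "l"; the
    // loop then skips the version pseudo-attribute, collects key "encoding" and
    // quoted value "ISO-8859-1", checks it against the EncName production, and
    // calls setEncoding(pb, "ISO-8859-1").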

    private void setEncoding(InputStream stream, String encoding)
            throws IOException {
        assignedEncoding = encoding;
        in = createReader(stream, encoding);
    }

    /**
     * Returns the number of characters read into the buffer, or -1 on EOF.
     */
    public int read(char buf [], int off, int len) throws IOException {
        int val;

        if (closed)
            return -1;          // throw new IOException ("closed");
        val = in.read(buf, off, len);
        if (val == -1)
            close();
        return val;
    }

    /**
     * Reads a single character.
     */
    public int read() throws IOException {
        int val;

        if (closed)
            throw new IOException("closed");
        val = in.read();
        if (val == -1)
            close();
        return val;
    }

    /**
     * Returns true iff the reader supports mark/reset.
     */
    public boolean markSupported() {
        return in == null ? false : in.markSupported();
    }

    /**
     * Sets a mark allowing a limited number of characters to
     * be "peeked", by reading and then resetting.
     *
     * @param value how many characters may be "peeked".
     */
    public void mark(int value) throws IOException {
        if (in != null) in.mark(value);
    }

    /**
     * Resets the current position to the last marked position.
     */
    public void reset() throws IOException {
        if (in != null) in.reset();
    }

    /**
     * Skips a specified number of characters.
     */
    public long skip(long value) throws IOException {
        return in == null ? 0 : in.skip(value);
    }

    /**
     * Returns true iff input characters are known to be ready.
     */
    public boolean ready() throws IOException {
        return in == null ? false : in.ready();
    }

    /**
     * Closes the reader.
     */
    public void close() throws IOException {
        if (closed)
            return;
        in.close();
        in = null;
        closed = true;
    }

    //
    // Delegating to a converter module will always be slower than
    // direct conversion.  Use a similar approach for any other
    // readers that need to be particularly fast; only block I/O
    // speed matters to this package.  For UTF-16, separate readers
    // for big and little endian streams make a difference, too;
    // fewer conditionals in the critical path!
    //
    static abstract class BaseReader extends Reader {
        protected InputStream instream;
        protected byte buffer [];
        protected int start, finish;

        BaseReader(InputStream stream) {
            super(stream);

            instream = stream;
            buffer = new byte[8192];
        }

        public boolean ready() throws IOException {
            return instream == null
                    || (finish - start) > 0
                    || instream.available() != 0;
        }

        // caller shouldn't read again
        public void close() throws IOException {
            if (instream != null) {
                instream.close();
                start = finish = 0;
                buffer = null;
                instream = null;
            }
        }
    }

    //
    // We want this reader, to make the default encoding be as fast
    // as we can make it.  JDK's "UTF8" (not "UTF-8" till JDK 1.2)
    // InputStreamReader works, but 20+% slower speed isn't OK for
    // the default/primary encoding.
    //
    static final class Utf8Reader extends BaseReader {
        // second (low) half of the UTF-16 surrogate pair produced
        // when a four-byte UTF-8 sequence is decoded
        private char nextChar;

        Utf8Reader(InputStream stream) {
            super(stream);
        }

        public int read(char buf [], int offset, int len) throws IOException {
            int i = 0, c = 0;

            if (len <= 0)
                return 0;

            // Consume remaining half of any surrogate pair immediately
            if (nextChar != 0) {
                buf[offset + i++] = nextChar;
                nextChar = 0;
            }

            while (i < len) {
                // stop or read data if needed
                if (finish <= start) {
                    if (instream == null) {
                        c = -1;
                        break;
                    }
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        c = -1;
                        break;
                    }
                }

                //
                // RFC 2279 describes UTF-8; there are six encodings.
                // Each encoding takes a fixed number of characters
                // (1-6 bytes) and is flagged by a bit pattern in the
                // first byte.  The five and six byte-per-character
                // encodings address characters which are disallowed
                // in XML documents, as do some four byte ones.
                //

                //
                // Single byte == ASCII.  Common; optimize.
                //
                c = buffer[start] & 0x0ff;
                if ((c & 0x80) == 0x00) {
                    // 0x0000 <= c <= 0x007f
                    start++;
                    buf[offset + i++] = (char) c;
                    continue;
                }

                //
                // Multibyte chars -- check offsets optimistically,
                // ditto the "10xx xxxx" format for subsequent bytes
                //
                int off = start;

                try {
                    // 2 bytes
                    if ((buffer[off] & 0x0E0) == 0x0C0) {
                        c = (buffer[off++] & 0x1f) << 6;
                        c += buffer[off++] & 0x3f;

                        // 0x0080 <= c <= 0x07ff

                    // 3 bytes
                    } else if ((buffer[off] & 0x0F0) == 0x0E0) {
                        c = (buffer[off++] & 0x0f) << 12;
                        c += (buffer[off++] & 0x3f) << 6;
                        c += buffer[off++] & 0x3f;

                        // 0x0800 <= c <= 0xffff

                    // 4 bytes
                    } else if ((buffer[off] & 0x0f8) == 0x0F0) {
                        c = (buffer[off++] & 0x07) << 18;
                        c += (buffer[off++] & 0x3f) << 12;
                        c += (buffer[off++] & 0x3f) << 6;
                        c += buffer[off++] & 0x3f;

                        // 0x0001 0000 <= c <= 0x001f ffff

                        // Unicode supports c <= 0x0010 ffff ...
                        if (c > 0x0010ffff)
                            throw new CharConversionException("UTF-8 encoding of character 0x00"
                                    + Integer.toHexString(c)
                                    + " can't be converted to Unicode.");

                        // Convert UCS-4 char to surrogate pair (UTF-16)
                        c -= 0x10000;
                        nextChar = (char) (0xDC00 + (c & 0x03ff));
                        c = 0xD800 + (c >> 10);

                    // 5 and 6 byte versions are XML WF errors, but
                    // typically come from mislabeled encodings
                    } else
                        throw new CharConversionException("Unconvertible UTF-8 character"
                                + " beginning with 0x"
                                + Integer.toHexString(buffer[start] & 0xff));

                } catch (ArrayIndexOutOfBoundsException e) {
                    // off > length && length >= buffer.length
                    c = 0;
                }

                //
                // if the buffer held only a partial character,
                // compact it and try to read the rest of the
                // character.  worst case involves three
                // single-byte reads -- quite rare.
                //
                if (off > finish) {
                    System.arraycopy(buffer, start,
                            buffer, 0, finish - start);
                    finish -= start;
                    start = 0;
                    off = instream.read(buffer, finish,
                            buffer.length - finish);
                    if (off < 0) {
                        this.close();
                        throw new CharConversionException("Partial UTF-8 char");
                    }
                    finish += off;
                    continue;
                }

                //
                // check the format of the non-initial bytes
                //
                for (start++; start < off; start++) {
                    if ((buffer[start] & 0xC0) != 0x80) {
                        this.close();
                        throw new CharConversionException("Malformed UTF-8 char -- "
                                + "is an XML encoding declaration missing?");
                    }
                }

                //
                // If this needed a surrogate pair, consume ASAP
                //
                buf[offset + i++] = (char) c;
                if (nextChar != 0 && i < len) {
                    buf[offset + i++] = nextChar;
                    nextChar = 0;
                }
            }
            if (i > 0)
                return i;
            return (c == -1) ? -1 : 0;
        }
    }
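
    // Worked example for Utf8Reader.read() above (values follow directly from
    // the shift/mask arithmetic in the method):
    //     bytes C3 A9       -> ((0xC3 & 0x1f) << 6) + (0xA9 & 0x3f) = 0x00E9 ('\u00e9')
    //     bytes F0 9F 80 80 -> scalar 0x1F000, returned as the surrogate pair
    //                          '\ud83c' then '\udc00'; the low half sits in nextChar
    //                          and is emitted immediately, or at the start of the
    //                          next read() call if the caller's buffer is full.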

    //
    // We want ASCII and ISO-8859 Readers since they're the most common
    // encodings in the US and Europe, and we don't want performance
    // regressions for them.  They're also easy to implement efficiently,
    // since they're bitmask subsets of UNICODE.
    //
    // XXX haven't benchmarked these readers vs what we get out of JDK.
    //
    static final class AsciiReader extends BaseReader {
        AsciiReader(InputStream in) {
            super(in);
        }

        public int read(char buf [], int offset, int len) throws IOException {
            int i, c;

            if (instream == null)
                return -1;

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                c = buffer[start++];
                if ((c & 0x80) != 0)
                    throw new CharConversionException("Illegal ASCII character, 0x"
                            + Integer.toHexString(c & 0xff));
                buf[offset + i] = (char) c;
            }
            if (i == 0 && finish <= 0)
                return -1;
            return i;
        }
    }
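
    // Note on the behavior above: any byte with the high bit set (for example
    // 0xC3 from UTF-8 text mislabeled as US-ASCII) raises CharConversionException
    // rather than being silently mapped, mirroring the "mislabeled encoding"
    // diagnostics in Utf8Reader.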

    static final class Iso8859_1Reader extends BaseReader {
        Iso8859_1Reader(InputStream in) {
            super(in);
        }

        public int read(char buf [], int offset, int len) throws IOException {
            int i;

            if (instream == null)
                return -1;

            for (i = 0; i < len; i++) {
                if (start >= finish) {
                    start = 0;
                    finish = instream.read(buffer, 0, buffer.length);
                    if (finish <= 0) {
                        this.close();
                        break;
                    }
                }
                buf[offset + i] = (char) (0x0ff & buffer[start++]);
            }
            if (i == 0 && finish <= 0)
                return -1;
            return i;
        }
    }
}