388 lines
14 KiB
Java
388 lines
14 KiB
Java
/*
 * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
|
|
|
|
package com.sun.xml.internal.dtdparser;
|
|
|
|
|
|
/**
 * Methods in this class are used to determine whether characters may
 * appear in certain roles in XML documents.  Such methods are used
 * both to parse and to create such documents.
 *
 * <p>All methods are static and the class cannot be instantiated.
 *
 * @author David Brownell
 * @version 1.1, 00/08/05
 */
public final class XmlChars {

    // can't construct instances
    private XmlChars() {
    }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The <code>char</code> datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <p>In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code point matches the XML <code>Char</code>
     *         production; false otherwise (including for negative values,
     *         surrogate code points, 0xFFFE and 0xFFFF)
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //    ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
                || ucs4char == 0x000A || ucs4char == 0x0009
                || ucs4char == 0x000D
                || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
                || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff));
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested
     * @return true if <code>c</code> may appear after the first character
     *         of an XML name
     * @see #isNCNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        // Letters, digits and combining characters are all handled here.
        if (isLetter2(c)) {
            return true;
        }
        // Fast rejection of the character that most commonly ends a name.
        if (c == '>') {
            return false;
        }
        return c == '.' || c == '-' || c == '_' || c == ':'
                || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is not a colon and satisfies
     *         {@link #isNameChar(char)}
     * @see #isNameChar(char)
     * @see #isLetter(char)
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested
     * @return true for space, tab, line feed and carriage return only
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }

    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested
     * @return true if <code>c</code> is treated as an XML letter
     * @see #isNameChar(char)
     * @see #isNCNameChar(char)
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (isAsciiLetter(c)) {
            return true;
        }
        if (c == '/') {
            return false;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
                // OK, here we just have some exceptions to check...
                return passesNameExclusions(c);

            default:
                // check for some exceptions:  these are "alphabetic"
                return ((c >= 0x02bb && c <= 0x02c1)
                        || c == 0x0559 || c == 0x06e5 || c == 0x06e6);
        }
    }

    // True for the unaccented ASCII letters a-z and A-Z; used as the
    // fast path shared by isLetter and isLetter2.
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    // Exception check shared by isLetter and isLetter2 for characters
    // whose Unicode category is otherwise acceptable: rejects
    // "compatibility" characters and, per "5.14 of Unicode", the
    // combiners in the range 0x20dd-0x20e0.
    private static boolean passesNameExclusions(char c) {
        return !isCompatibilityChar(c)
                && !(c >= 0x20dd && c <= 0x20e0);
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but the switch can reduce the number which must
        // actually be executed.

        // dispatch on the high-order byte of the character
        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                        || (c >= 0x013f && c <= 0x0140)
                        || c == 0x0149
                        || c == 0x017f
                        || (c >= 0x01c4 && c <= 0x01cc)
                        || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                        || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                 // Greek

            case 0x05:
                return c == 0x0587;                 // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;  // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                        || c == 0x1104
                        || c == 0x1108
                        || c == 0x110a
                        || c == 0x110d
                        || (c >= 0x1113 && c <= 0x113b)
                        || c == 0x113d
                        || c == 0x113f
                        || (c >= 0x1141 && c <= 0x114b)
                        || c == 0x114d
                        || c == 0x114f
                        || (c >= 0x1151 && c <= 0x1153)
                        || (c >= 0x1156 && c <= 0x1158)
                        || c == 0x1162
                        || c == 0x1164
                        || c == 0x1166
                        || c == 0x1168
                        || (c >= 0x116a && c <= 0x116c)
                        || (c >= 0x116f && c <= 0x1171)
                        || c == 0x1174
                        || (c >= 0x1176 && c <= 0x119d)
                        || (c >= 0x119f && c <= 0x11a2)
                        || (c >= 0x11a9 && c <= 0x11aa)
                        || (c >= 0x11ac && c <= 0x11ad)
                        || (c >= 0x11b0 && c <= 0x11b6)
                        || c == 0x11b9
                        || c == 0x11bb
                        || (c >= 0x11c3 && c <= 0x11ea)
                        || (c >= 0x11ec && c <= 0x11ef)
                        || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                 // superscript

            case 0x21:
                return
                        // various letterlike symbols
                        c == 0x2102
                        || c == 0x2107
                        || (c >= 0x210a && c <= 0x2113)
                        || c == 0x2115
                        || (c >= 0x2118 && c <= 0x211d)
                        || c == 0x2124
                        || c == 0x2128
                        || (c >= 0x212c && c <= 0x212d)
                        || (c >= 0x212f && c <= 0x2138)

                        // most Roman numerals (less 1K, 5K, 10K)
                        || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case.
        //
        if (isAsciiLetter(c)) {
            return true;
        }
        if (c == '>') {
            return false;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                // OK, here we just have some exceptions to check...
                return passesNameExclusions(c);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // NOTE(review): not referenced by any other method of this class;
    // retained as the implementation of production [88].
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
                && !((c >= 0xff10) && (c <= 0xff19));
    }

    // True if the character is one of the code points enumerated for
    // the XML Extender production.
    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
                || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
                || (c >= 0x309d && c <= 0x309e)
                || (c >= 0x30fc && c <= 0x30fe);
    }
}
|