feat(jdk8): move files to new folder to avoid resources compiled.
This commit is contained in:
349
jdkSrc/jdk8/sun/text/normalizer/CharTrie.java
Normal file
349
jdkSrc/jdk8/sun/text/normalizer/CharTrie.java
Normal file
@@ -0,0 +1,349 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Trie implementation which stores data in char, 16 bits.
|
||||
* @author synwee
|
||||
* @see com.ibm.icu.impl.Trie
|
||||
* @since release 2.1, Jan 01 2002
|
||||
*/
|
||||
|
||||
// note that i need to handle the block calculations later, since chartrie
|
||||
// in icu4c uses the same index array.
|
||||
public class CharTrie extends Trie
|
||||
{
|
||||
// public constructors ---------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Creates a new Trie with the settings for the trie data.</p>
|
||||
* <p>Unserialize the 32-bit-aligned input stream and use the data for the
|
||||
* trie.</p>
|
||||
* @param inputStream file input stream to a ICU data file, containing
|
||||
* the trie
|
||||
* @param dataManipulate object which provides methods to parse the char
|
||||
* data
|
||||
* @throws IOException thrown when data reading fails
|
||||
* @draft 2.1
|
||||
*/
|
||||
public CharTrie(InputStream inputStream,
|
||||
DataManipulate dataManipulate) throws IOException
|
||||
{
|
||||
super(inputStream, dataManipulate);
|
||||
|
||||
if (!isCharTrie()) {
|
||||
throw new IllegalArgumentException(
|
||||
"Data given does not belong to a char trie.");
|
||||
}
|
||||
m_friendAgent_ = new FriendAgent();
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a dummy CharTrie.
|
||||
* A dummy trie is an empty runtime trie, used when a real data trie cannot
|
||||
* be loaded.
|
||||
*
|
||||
* The trie always returns the initialValue,
|
||||
* or the leadUnitValue for lead surrogate code points.
|
||||
* The Latin-1 part is always set up to be linear.
|
||||
*
|
||||
* @param initialValue the initial value that is set for all code points
|
||||
* @param leadUnitValue the value for lead surrogate code _units_ that do not
|
||||
* have associated supplementary data
|
||||
* @param dataManipulate object which provides methods to parse the char data
|
||||
*/
|
||||
public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
|
||||
super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);
|
||||
|
||||
int dataLength, latin1Length, i, limit;
|
||||
char block;
|
||||
|
||||
/* calculate the actual size of the dummy trie data */
|
||||
|
||||
/* max(Latin-1, block 0) */
|
||||
dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
|
||||
if(leadUnitValue!=initialValue) {
|
||||
dataLength+=DATA_BLOCK_LENGTH;
|
||||
}
|
||||
m_data_=new char[dataLength];
|
||||
m_dataLength_=dataLength;
|
||||
|
||||
m_initialValue_=(char)initialValue;
|
||||
|
||||
/* fill the index and data arrays */
|
||||
|
||||
/* indexes are preset to 0 (block 0) */
|
||||
|
||||
/* Latin-1 data */
|
||||
for(i=0; i<latin1Length; ++i) {
|
||||
m_data_[i]=(char)initialValue;
|
||||
}
|
||||
|
||||
if(leadUnitValue!=initialValue) {
|
||||
/* indexes for lead surrogate code units to the block after Latin-1 */
|
||||
block=(char)(latin1Length>>INDEX_STAGE_2_SHIFT_);
|
||||
i=0xd800>>INDEX_STAGE_1_SHIFT_;
|
||||
limit=0xdc00>>INDEX_STAGE_1_SHIFT_;
|
||||
for(; i<limit; ++i) {
|
||||
m_index_[i]=block;
|
||||
}
|
||||
|
||||
/* data for lead surrogate code units */
|
||||
limit=latin1Length+DATA_BLOCK_LENGTH;
|
||||
for(i=latin1Length; i<limit; ++i) {
|
||||
m_data_[i]=(char)leadUnitValue;
|
||||
}
|
||||
}
|
||||
|
||||
m_friendAgent_ = new FriendAgent();
|
||||
}
|
||||
|
||||
/**
|
||||
* Java friend implementation
|
||||
*/
|
||||
public class FriendAgent
|
||||
{
|
||||
/**
|
||||
* Gives out the index array of the trie
|
||||
* @return index array of trie
|
||||
*/
|
||||
public char[] getPrivateIndex()
|
||||
{
|
||||
return m_index_;
|
||||
}
|
||||
/**
|
||||
* Gives out the data array of the trie
|
||||
* @return data array of trie
|
||||
*/
|
||||
public char[] getPrivateData()
|
||||
{
|
||||
return m_data_;
|
||||
}
|
||||
/**
|
||||
* Gives out the data offset in the trie
|
||||
* @return data offset in the trie
|
||||
*/
|
||||
public int getPrivateInitialValue()
|
||||
{
|
||||
return m_initialValue_;
|
||||
}
|
||||
}
|
||||
|
||||
// public methods --------------------------------------------------
|
||||
|
||||
/**
|
||||
* Java friend implementation
|
||||
* To store the index and data array into the argument.
|
||||
* @param friend java friend UCharacterProperty object to store the array
|
||||
*/
|
||||
public void putIndexData(UCharacterProperty friend)
|
||||
{
|
||||
friend.setIndexData(m_friendAgent_);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the value associated with the codepoint.
|
||||
* If no value is associated with the codepoint, a default value will be
|
||||
* returned.
|
||||
* @param ch codepoint
|
||||
* @return offset to data
|
||||
* @draft 2.1
|
||||
*/
|
||||
public final char getCodePointValue(int ch)
|
||||
{
|
||||
int offset;
|
||||
|
||||
// fastpath for U+0000..U+D7FF
|
||||
if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||||
// copy of getRawOffset()
|
||||
offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_)
|
||||
+ (ch & INDEX_STAGE_3_MASK_);
|
||||
return m_data_[offset];
|
||||
}
|
||||
|
||||
// handle U+D800..U+10FFFF
|
||||
offset = getCodePointOffset(ch);
|
||||
|
||||
// return -1 if there is an error, in this case we return the default
|
||||
// value: m_initialValue_
|
||||
return (offset >= 0) ? m_data_[offset] : m_initialValue_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the value to the data which this lead surrogate character points
|
||||
* to.
|
||||
* Returned data may contain folding offset information for the next
|
||||
* trailing surrogate character.
|
||||
* This method does not guarantee correct results for trail surrogates.
|
||||
* @param ch lead surrogate character
|
||||
* @return data value
|
||||
* @draft 2.1
|
||||
*/
|
||||
public final char getLeadValue(char ch)
|
||||
{
|
||||
return m_data_[getLeadOffset(ch)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the value associated with a pair of surrogates.
|
||||
* @param lead a lead surrogate
|
||||
* @param trail a trail surrogate
|
||||
* @draft 2.1
|
||||
*/
|
||||
public final char getSurrogateValue(char lead, char trail)
|
||||
{
|
||||
int offset = getSurrogateOffset(lead, trail);
|
||||
if (offset > 0) {
|
||||
return m_data_[offset];
|
||||
}
|
||||
return m_initialValue_;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Get a value from a folding offset (from the value of a lead surrogate)
|
||||
* and a trail surrogate.</p>
|
||||
* <p>If the
|
||||
* @param leadvalue value associated with the lead surrogate which contains
|
||||
* the folding offset
|
||||
* @param trail surrogate
|
||||
* @return trie data value associated with the trail character
|
||||
* @draft 2.1
|
||||
*/
|
||||
public final char getTrailValue(int leadvalue, char trail)
|
||||
{
|
||||
if (m_dataManipulate_ == null) {
|
||||
throw new NullPointerException(
|
||||
"The field DataManipulate in this Trie is null");
|
||||
}
|
||||
int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
|
||||
if (offset > 0) {
|
||||
return m_data_[getRawOffset(offset,
|
||||
(char)(trail & SURROGATE_MASK_))];
|
||||
}
|
||||
return m_initialValue_;
|
||||
}
|
||||
|
||||
// protected methods -----------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Parses the input stream and stores its trie content into a index and
|
||||
* data array</p>
|
||||
* @param inputStream data input stream containing trie data
|
||||
* @exception IOException thrown when data reading fails
|
||||
*/
|
||||
protected final void unserialize(InputStream inputStream)
|
||||
throws IOException
|
||||
{
|
||||
DataInputStream input = new DataInputStream(inputStream);
|
||||
int indexDataLength = m_dataOffset_ + m_dataLength_;
|
||||
m_index_ = new char[indexDataLength];
|
||||
for (int i = 0; i < indexDataLength; i ++) {
|
||||
m_index_[i] = input.readChar();
|
||||
}
|
||||
m_data_ = m_index_;
|
||||
m_initialValue_ = m_data_[m_dataOffset_];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the offset to the data which the surrogate pair points to.
|
||||
* @param lead lead surrogate
|
||||
* @param trail trailing surrogate
|
||||
* @return offset to data
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected final int getSurrogateOffset(char lead, char trail)
|
||||
{
|
||||
if (m_dataManipulate_ == null) {
|
||||
throw new NullPointerException(
|
||||
"The field DataManipulate in this Trie is null");
|
||||
}
|
||||
|
||||
// get fold position for the next trail surrogate
|
||||
int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
|
||||
|
||||
// get the real data from the folded lead/trail units
|
||||
if (offset > 0) {
|
||||
return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
|
||||
}
|
||||
|
||||
// return -1 if there is an error, in this case we return the default
|
||||
// value: m_initialValue_
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the value at the argument index.
|
||||
* For use internally in TrieIterator.
|
||||
* @param index value at index will be retrieved
|
||||
* @return 32 bit value
|
||||
* @see com.ibm.icu.impl.TrieIterator
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected final int getValue(int index)
|
||||
{
|
||||
return m_data_[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the default initial value
|
||||
* @return 32 bit value
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected final int getInitialValue()
|
||||
{
|
||||
return m_initialValue_;
|
||||
}
|
||||
|
||||
// private data members --------------------------------------------
|
||||
|
||||
/**
|
||||
* Default value
|
||||
*/
|
||||
private char m_initialValue_;
|
||||
/**
|
||||
* Array of char data
|
||||
*/
|
||||
private char m_data_[];
|
||||
/**
|
||||
* Agent for friends
|
||||
*/
|
||||
private FriendAgent m_friendAgent_;
|
||||
}
|
||||
146
jdkSrc/jdk8/sun/text/normalizer/CharacterIteratorWrapper.java
Normal file
146
jdkSrc/jdk8/sun/text/normalizer/CharacterIteratorWrapper.java
Normal file
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
/**
|
||||
* This class is a wrapper around CharacterIterator and implements the
|
||||
* UCharacterIterator protocol
|
||||
* @author ram
|
||||
*/
|
||||
|
||||
public class CharacterIteratorWrapper extends UCharacterIterator {
|
||||
|
||||
private CharacterIterator iterator;
|
||||
|
||||
public CharacterIteratorWrapper(CharacterIterator iter){
|
||||
if(iter==null){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
iterator = iter;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#current()
|
||||
*/
|
||||
public int current() {
|
||||
int c = iterator.current();
|
||||
if(c==CharacterIterator.DONE){
|
||||
return DONE;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#getLength()
|
||||
*/
|
||||
public int getLength() {
|
||||
return (iterator.getEndIndex() - iterator.getBeginIndex());
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#getIndex()
|
||||
*/
|
||||
public int getIndex() {
|
||||
return iterator.getIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#next()
|
||||
*/
|
||||
public int next() {
|
||||
int i = iterator.current();
|
||||
iterator.next();
|
||||
if(i==CharacterIterator.DONE){
|
||||
return DONE;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#previous()
|
||||
*/
|
||||
public int previous() {
|
||||
int i = iterator.previous();
|
||||
if(i==CharacterIterator.DONE){
|
||||
return DONE;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#setIndex(int)
|
||||
*/
|
||||
public void setIndex(int index) {
|
||||
iterator.setIndex(index);
|
||||
}
|
||||
|
||||
//// for StringPrep
|
||||
/**
|
||||
* @see UCharacterIterator#getText(char[])
|
||||
*/
|
||||
public int getText(char[] fillIn, int offset){
|
||||
int length =iterator.getEndIndex() - iterator.getBeginIndex();
|
||||
int currentIndex = iterator.getIndex();
|
||||
if(offset < 0 || offset + length > fillIn.length){
|
||||
throw new IndexOutOfBoundsException(Integer.toString(length));
|
||||
}
|
||||
|
||||
for (char ch = iterator.first(); ch != CharacterIterator.DONE; ch = iterator.next()) {
|
||||
fillIn[offset++] = ch;
|
||||
}
|
||||
iterator.setIndex(currentIndex);
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a clone of this iterator. Clones the underlying character iterator.
|
||||
* @see UCharacterIterator#clone()
|
||||
*/
|
||||
public Object clone(){
|
||||
try {
|
||||
CharacterIteratorWrapper result = (CharacterIteratorWrapper) super.clone();
|
||||
result.iterator = (CharacterIterator)this.iterator.clone();
|
||||
return result;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
return null; // only invoked if bad underlying character iterator
|
||||
}
|
||||
}
|
||||
}
|
||||
189
jdkSrc/jdk8/sun/text/normalizer/ICUBinary.java
Normal file
189
jdkSrc/jdk8/sun/text/normalizer/ICUBinary.java
Normal file
@@ -0,0 +1,189 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
public final class ICUBinary
{
    // public inner interface ------------------------------------------------

    /**
     * Special interface for data authentication
     */
    public interface Authenticate
    {
        /**
         * Method used in ICUBinary.readHeader() to provide data format
         * authentication.
         * @param version version of the current data
         * @return true if dataformat is an acceptable version, false otherwise
         */
        boolean isDataVersionAcceptable(byte[] version);
    }

    // public methods --------------------------------------------------------

    /**
     * <p>ICU data header reader method.
     * Takes a ICU generated big-endian input stream, parse the ICU standard
     * file header and authenticates them.</p>
     * <p>Header format:
     * <ul>
     *     <li> Header size (char)
     *     <li> Magic number 1 (byte)
     *     <li> Magic number 2 (byte)
     *     <li> Rest of the header size (char)
     *     <li> Reserved word (char)
     *     <li> Big endian indicator (byte)
     *     <li> Character set family indicator (byte)
     *     <li> Size of a char (byte) for c++ and c use
     *     <li> Reserved byte (byte)
     *     <li> Data format identifier (4 bytes), each ICU data has its own
     *          identifier to distinguish them. [0] major [1] minor
     *                                          [2] milli [3] micro
     *     <li> Data version (4 bytes), the change version of the ICU data
     *                             [0] major [1] minor [2] milli [3] micro
     *     <li> Unicode version (4 bytes) this ICU is based on.
     * </ul>
     * </p>
     * <p>Any header bytes beyond these fields (up to the declared header
     * size) are consumed, so that on return the stream is positioned at the
     * first byte after the header.</p>
     * <p>
     * Example of use:<br>
     * <pre>
     * try {
     *    FileInputStream input = new FileInputStream(filename);
     *    byte[] unicodeVersion = ICUBinary.readHeader(input, dataformat,
     *                                                 authenticate);
     *    System.out.println("Verified file header, this is a ICU data file");
     * } catch (IOException e) {
     *    System.out.println("This is not a ICU data file");
     * }
     * </pre>
     * </p>
     * @param inputStream input stream that contains the ICU data header
     * @param dataFormatIDExpected Data format expected. An array of 4 bytes
     *                     information about the data format.
     *                     E.g. data format ID 1.2.3.4. will became an array of
     *                     {1, 2, 3, 4}
     * @param authenticate user defined extra data authentication. This value
     *                     can be null, if no extra authentication is needed.
     * @return the 4-byte Unicode version read from the header
     * @exception IOException thrown if there is a read error, if the stream
     *                     ends before the declared header size, or
     *                     when header authentication fails.
     * @draft 2.1
     */
    public static final byte[] readHeader(InputStream inputStream,
                                        byte dataFormatIDExpected[],
                                        Authenticate authenticate)
                                                          throws IOException
    {
        DataInputStream input = new DataInputStream(inputStream);
        char headersize = input.readChar();
        int readcount = 2;
        //reading the header format
        byte magic1 = input.readByte();
        readcount ++;
        byte magic2 = input.readByte();
        readcount ++;
        if (magic1 != MAGIC1 || magic2 != MAGIC2) {
            throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
        }

        input.readChar(); // reading size
        readcount += 2;
        input.readChar(); // reading reserved word
        readcount += 2;
        byte bigendian    = input.readByte();
        readcount ++;
        byte charset      = input.readByte();
        readcount ++;
        byte charsize     = input.readByte();
        readcount ++;
        input.readByte(); // reading reserved byte
        readcount ++;

        byte dataFormatID[] = new byte[4];
        input.readFully(dataFormatID);
        readcount += 4;
        byte dataVersion[] = new byte[4];
        input.readFully(dataVersion);
        readcount += 4;
        byte unicodeVersion[] = new byte[4];
        input.readFully(unicodeVersion);
        readcount += 4;
        if (headersize < readcount) {
            throw new IOException("Internal Error: Header size error");
        }
        // consume the remainder of the header; a single skipBytes() call is
        // not enough because it may skip fewer bytes than requested
        skipFully(input, headersize - readcount);

        if (bigendian != BIG_ENDIAN_ || charset != CHAR_SET_
            || charsize != CHAR_SIZE_
            || !Arrays.equals(dataFormatIDExpected, dataFormatID)
            || (authenticate != null
                && !authenticate.isDataVersionAcceptable(dataVersion))) {
            throw new IOException(HEADER_AUTHENTICATION_FAILED_);
        }
        return unicodeVersion;
    }

    // private methods -------------------------------------------------------

    /**
     * Consumes exactly {@code count} bytes from the stream.
     * When skipBytes() makes no progress a single byte is read instead, which
     * either advances the stream or throws EOFException at end of stream.
     * @param input stream to advance
     * @param count number of bytes to consume; nothing is done if not positive
     * @exception IOException on a read error or premature end of stream
     */
    private static void skipFully(DataInputStream input, int count)
                                                          throws IOException
    {
        int remaining = count;
        while (remaining > 0) {
            int skipped = input.skipBytes(remaining);
            if (skipped <= 0) {
                input.readByte();
                skipped = 1;
            }
            remaining -= skipped;
        }
    }

    // private variables -------------------------------------------------

    /**
    * Magic numbers to authenticate the data file
    */
    private static final byte MAGIC1 = (byte)0xda;
    private static final byte MAGIC2 = (byte)0x27;

    /**
    * File format authentication values
    */
    private static final byte BIG_ENDIAN_ = 1;
    private static final byte CHAR_SET_ = 0;
    private static final byte CHAR_SIZE_ = 2;

    /**
    * Error messages
    */
    private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ =
                       "ICU data file error: Not an ICU data file";
    private static final String HEADER_AUTHENTICATION_FAILED_ =
        "ICU data file error: Header authentication failed, please check if you have a valid ICU data file";
}
|
||||
83
jdkSrc/jdk8/sun/text/normalizer/ICUData.java
Normal file
83
jdkSrc/jdk8/sun/text/normalizer/ICUData.java
Normal file
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.security.AccessController;
|
||||
import java.security.PrivilegedAction;
|
||||
import java.util.MissingResourceException;
|
||||
|
||||
/**
|
||||
* Provides access to ICU data files as InputStreams. Implements security checking.
|
||||
*/
|
||||
public final class ICUData {

    /**
     * Opens a resource relative to the given class. When a security manager
     * is installed the lookup runs inside a privileged action.
     *
     * @param root class whose getResourceAsStream() is used for the lookup
     * @param resourceName name of the resource to open
     * @param required whether a missing resource is an error
     * @return the resource stream, or null if it was not found and
     *         {@code required} is false
     * @throws MissingResourceException if the resource was not found and
     *         {@code required} is true
     */
    private static InputStream getStream(final Class<ICUData> root, final String resourceName, boolean required) {
        InputStream i = null;

        if (System.getSecurityManager() != null) {
            i = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
                    public InputStream run() {
                        return root.getResourceAsStream(resourceName);
                    }
                });
        } else {
            i = root.getResourceAsStream(resourceName);
        }

        if (i == null && required) {
            // getPackage() may return null; do not let that turn the intended
            // MissingResourceException into a NullPointerException
            Package pkg = root.getPackage();
            String packageName = (pkg != null) ? pkg.getName() : "";
            throw new MissingResourceException("could not locate data", packageName, resourceName);
        }
        return i;
    }

    /**
     * Convenience override that calls getStream(ICUData.class, resourceName, false);
     *
     * @param resourceName name of the resource to open
     * @return the resource stream, or null if the resource was not found
     */
    public static InputStream getStream(String resourceName) {
        return getStream(ICUData.class, resourceName, false);
    }

    /**
     * Convenience method that calls getStream(ICUData.class, resourceName, true).
     *
     * @param resourceName name of the resource to open
     * @return the resource stream, never null
     * @throws MissingResourceException if the resource was not found
     */
    public static InputStream getRequiredStream(String resourceName) {
        return getStream(ICUData.class, resourceName, true);
    }
}
|
||||
229
jdkSrc/jdk8/sun/text/normalizer/IntTrie.java
Normal file
229
jdkSrc/jdk8/sun/text/normalizer/IntTrie.java
Normal file
@@ -0,0 +1,229 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Trie implementation which stores data in int, 32 bits.
|
||||
* @author synwee
|
||||
* @see com.ibm.icu.impl.Trie
|
||||
* @since release 2.1, Jan 01 2002
|
||||
*/
|
||||
public class IntTrie extends Trie
|
||||
{
|
||||
// public constructors ---------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Creates a new Trie with the settings for the trie data.</p>
|
||||
* <p>Unserialize the 32-bit-aligned input stream and use the data for the
|
||||
* trie.</p>
|
||||
* @param inputStream file input stream to a ICU data file, containing
|
||||
* the trie
|
||||
* @param dataManipulate object which provides methods to parse the char
|
||||
* data
|
||||
* @throws IOException thrown when data reading fails
|
||||
* @draft 2.1
|
||||
*/
|
||||
public IntTrie(InputStream inputStream, DataManipulate datamanipulate)
|
||||
throws IOException
|
||||
{
|
||||
super(inputStream, datamanipulate);
|
||||
if (!isIntTrie()) {
|
||||
throw new IllegalArgumentException(
|
||||
"Data given does not belong to a int trie.");
|
||||
}
|
||||
}
|
||||
|
||||
// public methods --------------------------------------------------
|
||||
|
||||
/**
|
||||
* Gets the value associated with the codepoint.
|
||||
* If no value is associated with the codepoint, a default value will be
|
||||
* returned.
|
||||
* @param ch codepoint
|
||||
* @return offset to data
|
||||
* @draft 2.1
|
||||
*/
|
||||
public final int getCodePointValue(int ch)
|
||||
{
|
||||
int offset = getCodePointOffset(ch);
|
||||
return (offset >= 0) ? m_data_[offset] : m_initialValue_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the value to the data which this lead surrogate character points
|
||||
* to.
|
||||
* Returned data may contain folding offset information for the next
|
||||
* trailing surrogate character.
|
||||
* This method does not guarantee correct results for trail surrogates.
|
||||
* @param ch lead surrogate character
|
||||
* @return data value
|
||||
* @draft 2.1
|
||||
*/
|
||||
public final int getLeadValue(char ch)
|
||||
{
|
||||
return m_data_[getLeadOffset(ch)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a value from a folding offset (from the value of a lead surrogate)
|
||||
* and a trail surrogate.
|
||||
* @param leadvalue the value of a lead surrogate that contains the
|
||||
* folding offset
|
||||
* @param trail surrogate
|
||||
* @return trie data value associated with the trail character
|
||||
* @draft 2.1
|
||||
*/
|
||||
public final int getTrailValue(int leadvalue, char trail)
|
||||
{
|
||||
if (m_dataManipulate_ == null) {
|
||||
throw new NullPointerException(
|
||||
"The field DataManipulate in this Trie is null");
|
||||
}
|
||||
int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
|
||||
if (offset > 0) {
|
||||
return m_data_[getRawOffset(offset,
|
||||
(char)(trail & SURROGATE_MASK_))];
|
||||
}
|
||||
return m_initialValue_;
|
||||
}
|
||||
|
||||
// protected methods -----------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Parses the input stream and stores its trie content into a index and
|
||||
* data array</p>
|
||||
* @param inputStream data input stream containing trie data
|
||||
* @exception IOException thrown when data reading fails
|
||||
*/
|
||||
protected final void unserialize(InputStream inputStream)
|
||||
throws IOException
|
||||
{
|
||||
super.unserialize(inputStream);
|
||||
// one used for initial value
|
||||
m_data_ = new int[m_dataLength_];
|
||||
DataInputStream input = new DataInputStream(inputStream);
|
||||
for (int i = 0; i < m_dataLength_; i ++) {
|
||||
m_data_[i] = input.readInt();
|
||||
}
|
||||
m_initialValue_ = m_data_[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the offset to the data which the surrogate pair points to.
|
||||
* @param lead lead surrogate
|
||||
* @param trail trailing surrogate
|
||||
* @return offset to data
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected final int getSurrogateOffset(char lead, char trail)
|
||||
{
|
||||
if (m_dataManipulate_ == null) {
|
||||
throw new NullPointerException(
|
||||
"The field DataManipulate in this Trie is null");
|
||||
}
|
||||
// get fold position for the next trail surrogate
|
||||
int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
|
||||
|
||||
// get the real data from the folded lead/trail units
|
||||
if (offset > 0) {
|
||||
return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
|
||||
}
|
||||
|
||||
// return -1 if there is an error, in this case we return the default
|
||||
// value: m_initialValue_
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the value at the argument index.
|
||||
* For use internally in TrieIterator
|
||||
* @param index value at index will be retrieved
|
||||
* @return 32 bit value
|
||||
* @see com.ibm.icu.impl.TrieIterator
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected final int getValue(int index)
|
||||
{
|
||||
return m_data_[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the default initial value
|
||||
* @return 32 bit value
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected final int getInitialValue()
|
||||
{
|
||||
return m_initialValue_;
|
||||
}
|
||||
|
||||
// package private methods -----------------------------------------
|
||||
|
||||
/**
|
||||
* Internal constructor for builder use
|
||||
* @param index the index array to be slotted into this trie
|
||||
* @param data the data array to be slotted into this trie
|
||||
* @param initialvalue the initial value for this trie
|
||||
* @param options trie options to use
|
||||
* @param datamanipulate folding implementation
|
||||
*/
|
||||
IntTrie(char index[], int data[], int initialvalue, int options,
|
||||
DataManipulate datamanipulate)
|
||||
{
|
||||
super(index, options, datamanipulate);
|
||||
m_data_ = data;
|
||||
m_dataLength_ = m_data_.length;
|
||||
m_initialValue_ = initialvalue;
|
||||
}
|
||||
|
||||
// private data members --------------------------------------------
|
||||
|
||||
/**
|
||||
* Default value
|
||||
*/
|
||||
private int m_initialValue_;
|
||||
/**
|
||||
* Array of char data
|
||||
*/
|
||||
private int m_data_[];
|
||||
}
|
||||
1683
jdkSrc/jdk8/sun/text/normalizer/NormalizerBase.java
Normal file
1683
jdkSrc/jdk8/sun/text/normalizer/NormalizerBase.java
Normal file
File diff suppressed because it is too large
Load Diff
389
jdkSrc/jdk8/sun/text/normalizer/NormalizerDataReader.java
Normal file
389
jdkSrc/jdk8/sun/text/normalizer/NormalizerDataReader.java
Normal file
@@ -0,0 +1,389 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @author Ram Viswanadha
|
||||
*/
|
||||
|
||||
/*
|
||||
* Description of the format of unorm.icu version 2.1.
|
||||
*
|
||||
* Main change from version 1 to version 2:
|
||||
* Use of new, common Trie instead of normalization-specific tries.
|
||||
* Change to version 2.1: add third/auxiliary trie with associated data.
|
||||
*
|
||||
* For more details of how to use the data structures see the code
|
||||
* in unorm.cpp (runtime normalization code) and
|
||||
* in gennorm.c and gennorm/store.c (build-time data generation).
|
||||
*
|
||||
* For the serialized format of Trie see Trie.c/TrieHeader.
|
||||
*
|
||||
* - Overall partition
|
||||
*
|
||||
* unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
|
||||
* After that there are the following structures:
|
||||
*
|
||||
* char indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file
|
||||
*
|
||||
* Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE]
|
||||
*
|
||||
* char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
|
||||
* extraData[0] contains the number of units for
|
||||
* FC_NFKC_Closure (formatVersion>=2.1)
|
||||
*
|
||||
* char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
|
||||
* combiningTableTop may include one 16-bit padding unit
|
||||
* to make sure that fcdTrie is 32-bit-aligned
|
||||
*
|
||||
* Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
|
||||
*
|
||||
* Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
|
||||
*
|
||||
*
|
||||
* The indexes array contains lengths and sizes of the following arrays and structures
|
||||
* as well as the following values:
|
||||
* indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
|
||||
* -- one more than the highest combining index computed for forward-only-combining characters
|
||||
* indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
|
||||
* -- number of combining indexes computed for both-ways-combining characters
|
||||
* indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
|
||||
* -- number of combining indexes computed for backward-only-combining characters
|
||||
*
|
||||
* indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
|
||||
* -- first code point with a quick check NF* value of NO/MAYBE
|
||||
*
|
||||
*
|
||||
* - Tries
|
||||
*
|
||||
* The main structures are two Trie tables ("compact arrays"),
|
||||
* each with one index array and one data array.
|
||||
* See Trie.h and Trie.c.
|
||||
*
|
||||
*
|
||||
* - Tries in unorm.icu
|
||||
*
|
||||
* The first trie (normTrie above)
|
||||
* provides data for the NF* quick checks and normalization.
|
||||
* The second trie (fcdTrie above) provides data just for FCD checks.
|
||||
*
|
||||
*
|
||||
* - norm32 data words from the first trie
|
||||
*
|
||||
* The norm32Table contains one 32-bit word "norm32" per code point.
|
||||
* It contains the following bit fields:
|
||||
* 31..16 extra data index, EXTRA_SHIFT is used to shift this field down
|
||||
* if this index is <EXTRA_INDEX_TOP then it is an index into
|
||||
* extraData[] where variable-length normalization data for this
|
||||
* code point is found
|
||||
* if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
|
||||
* then this is a norm32 for a leading surrogate, and the index
|
||||
* value is used together with the following trailing surrogate
|
||||
* code unit in the second trie access
|
||||
* if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
|
||||
* then this is a norm32 for a "special" character,
|
||||
* i.e., the character is a Hangul syllable or a Jamo
|
||||
* see EXTRA_HANGUL etc.
|
||||
* generally, instead of extracting this index from the norm32 and
|
||||
* comparing it with the above constants,
|
||||
* the normalization code compares the entire norm32 value
|
||||
* with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
|
||||
*
|
||||
* 15..8 combining class (cc) according to UnicodeData.txt
|
||||
*
|
||||
* 7..6 COMBINES_ANY flags, used in composition to see if a character
|
||||
* combines with any following or preceding character(s)
|
||||
* at all
|
||||
* 7 COMBINES_BACK
|
||||
* 6 COMBINES_FWD
|
||||
*
|
||||
* 5..0 quick check flags, set for "no" or "maybe", with separate flags for
|
||||
* each normalization form
|
||||
* the higher bits are "maybe" flags; for NF*D there are no such flags
|
||||
* the lower bits are "no" flags for all forms, in the same order
|
||||
* as the "maybe" flags,
|
||||
* which is (MSB to LSB): NFKD NFD NFKC NFC
|
||||
* 5..4 QC_ANY_MAYBE
|
||||
* 3..0 QC_ANY_NO
|
||||
* see further related constants
|
||||
*
|
||||
*
|
||||
* - Extra data per code point
|
||||
*
|
||||
* "Extra data" is referenced by the index in norm32.
|
||||
* It is variable-length data. It is only present, and only those parts
|
||||
* of it are, as needed for a given character.
|
||||
* The norm32 extra data index is added to the beginning of extraData[]
|
||||
* to get to a vector of 16-bit words with data at the following offsets:
|
||||
*
|
||||
* [-1] Combining index for composition.
|
||||
* Stored only if norm32&COMBINES_ANY .
|
||||
* [0] Lengths of the canonical and compatibility decomposition strings.
|
||||
* Stored only if there are decompositions, i.e.,
|
||||
* if norm32&(QC_NFD|QC_NFKD)
|
||||
* High byte: length of NFKD, or 0 if none
|
||||
* Low byte: length of NFD, or 0 if none
|
||||
* Each length byte also has another flag:
|
||||
* Bit 7 of a length byte is set if there are non-zero
|
||||
* combining classes (cc's) associated with the respective
|
||||
* decomposition. If this flag is set, then the decomposition
|
||||
* is preceded by a 16-bit word that contains the
|
||||
* leading and trailing cc's.
|
||||
* Bits 6..0 of a length byte are the length of the
|
||||
* decomposition string, not counting the cc word.
|
||||
* [1..n] NFD
|
||||
* [n+1..] NFKD
|
||||
*
|
||||
* Each of the two decompositions consists of up to two parts:
|
||||
* - The 16-bit words with the leading and trailing cc's.
|
||||
* This is only stored if bit 7 of the corresponding length byte
|
||||
* is set. In this case, at least one of the cc's is not zero.
|
||||
* High byte: leading cc==cc of the first code point in the decomposition string
|
||||
* Low byte: trailing cc==cc of the last code point in the decomposition string
|
||||
* - The decomposition string in UTF-16, with length code units.
|
||||
*
|
||||
*
|
||||
* - Combining indexes and combiningTable[]
|
||||
*
|
||||
* Combining indexes are stored at the [-1] offset of the extra data
|
||||
* if the character combines forward or backward with any other characters.
|
||||
* They are used for (re)composition in NF*C.
|
||||
* Values of combining indexes are arranged according to whether a character
|
||||
* combines forward, backward, or both ways:
|
||||
* forward-only < both ways < backward-only
|
||||
*
|
||||
* The index values for forward-only and both-ways combining characters
|
||||
* are indexes into the combiningTable[].
|
||||
* The index values for backward-only combining characters are simply
|
||||
* incremented from the preceding index values to be unique.
|
||||
*
|
||||
* In the combiningTable[], a variable-length list
|
||||
* of variable-length (back-index, code point) pair entries is stored
|
||||
* for each forward-combining character.
|
||||
*
|
||||
* These back-indexes are the combining indexes of both-ways or backward-only
|
||||
* combining characters that the forward-combining character combines with.
|
||||
*
|
||||
* Each list is sorted in ascending order of back-indexes.
|
||||
* Each list is terminated with the last back-index having bit 15 set.
|
||||
*
|
||||
* Each pair (back-index, code point) takes up either 2 or 3
|
||||
* 16-bit words.
|
||||
* The first word of a list entry is the back-index, with its bit 15 set if
|
||||
* this is the last pair in the list.
|
||||
*
|
||||
* The second word contains flags in bits 15..13 that determine
|
||||
* if there is a third word and how the combined character is encoded:
|
||||
* 15 set if there is a third word in this list entry
|
||||
* 14 set if the result is a supplementary character
|
||||
* 13 set if the result itself combines forward
|
||||
*
|
||||
* According to these bits 15..14 of the second word,
|
||||
* the result character is encoded as follows:
|
||||
* 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
|
||||
* the second word.
|
||||
* 10 The result is 0x2000..0xffff and stored in the third word.
|
||||
* Bits 12..0 of the second word are not used.
|
||||
* 11 The result is a supplementary character.
|
||||
* Bits 9..0 of the leading surrogate are in bits 9..0 of
|
||||
* the second word.
|
||||
* Add 0xd800 to these bits to get the complete surrogate.
|
||||
* Bits 12..10 of the second word are not used.
|
||||
* The trailing surrogate is stored in the third word.
|
||||
*
|
||||
*
|
||||
* - FCD trie
|
||||
*
|
||||
* The FCD trie is very simple.
|
||||
* It is a folded trie with 16-bit data words.
|
||||
* In each word, the high byte contains the leading cc of the character,
|
||||
* and the low byte contains the trailing cc of the character.
|
||||
* These cc's are the cc's of the first and last code points in the
|
||||
* canonical decomposition of the character.
|
||||
*
|
||||
* Since all 16 bits are used for cc's, lead surrogates must be tested
|
||||
* by checking the code unit instead of the trie data.
|
||||
* This is done only if the 16-bit data word is not zero.
|
||||
* If the code unit is a leading surrogate and the data word is not zero,
|
||||
* then instead of cc's it contains the offset for the second trie lookup.
|
||||
*
|
||||
*
|
||||
* - Auxiliary trie and data
|
||||
*
|
||||
*
|
||||
* The auxiliary 16-bit trie contains data for additional properties.
|
||||
* Bits
|
||||
* 15..13 reserved
|
||||
* 12 not NFC_Skippable (f) (formatVersion>=2.2)
|
||||
* 11 flag: not a safe starter for canonical closure
|
||||
* 10 composition exclusion
|
||||
* 9.. 0 index into extraData[] to FC_NFKC_Closure string
|
||||
* (not for lead surrogate),
|
||||
* or lead surrogate offset (for lead surrogate, if 9..0 not zero)
|
||||
*
|
||||
* Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
|
||||
* (used in NormalizerTransliterator)
|
||||
*
|
||||
* A skippable character is
|
||||
* a) unassigned, or ALL of the following:
|
||||
* b) of combining class 0.
|
||||
* c) not decomposed by this normalization form.
|
||||
* AND if NFC or NFKC,
|
||||
* d) can never compose with a previous character.
|
||||
* e) can never compose with a following character.
|
||||
* f) can never change if another character is added.
|
||||
* Example: a-breve might satisfy all but f, but if you
|
||||
* add an ogonek it changes to a-ogonek + breve
|
||||
*
|
||||
* a)..e) must be tested from norm32.
|
||||
* Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
|
||||
* into the auxiliary trie.
|
||||
* The same bit is used for NFC and NFKC; (c) differs for them.
|
||||
* As usual, we build the "not skippable" flags so that unassigned
|
||||
* code points get a 0 bit.
|
||||
* This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
|
||||
* Test Hangul LV syllables entirely in code.
|
||||
*
|
||||
*
|
||||
* - FC_NFKC_Closure strings in extraData[]
|
||||
*
|
||||
* Strings are either stored as a single code unit or as the length
|
||||
* followed by that many units.
|
||||
*
|
||||
*/
|
||||
final class NormalizerDataReader implements ICUBinary.Authenticate {
|
||||
|
||||
/**
|
||||
* <p>Protected constructor.</p>
|
||||
* @param inputStream ICU uprop.dat file input stream
|
||||
* @exception IOException throw if data file fails authentication
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected NormalizerDataReader(InputStream inputStream)
|
||||
throws IOException{
|
||||
|
||||
unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
|
||||
dataInputStream = new DataInputStream(inputStream);
|
||||
}
|
||||
|
||||
// protected methods -------------------------------------------------
|
||||
|
||||
protected int[] readIndexes(int length)throws IOException{
|
||||
int[] indexes = new int[length];
|
||||
//Read the indexes
|
||||
for (int i = 0; i <length ; i++) {
|
||||
indexes[i] = dataInputStream.readInt();
|
||||
}
|
||||
return indexes;
|
||||
}
|
||||
/**
|
||||
* <p>Reads unorm.icu, parse it into blocks of data to be stored in
|
||||
* NormalizerImpl.</P
|
||||
* @param normBytes
|
||||
* @param fcdBytes
|
||||
* @param auxBytes
|
||||
* @param extraData
|
||||
* @param combiningTable
|
||||
* @exception thrown when data reading fails
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
|
||||
char[] extraData, char[] combiningTable)
|
||||
throws IOException{
|
||||
|
||||
//Read the bytes that make up the normTrie
|
||||
dataInputStream.readFully(normBytes);
|
||||
|
||||
//normTrieStream= new ByteArrayInputStream(normBytes);
|
||||
|
||||
//Read the extra data
|
||||
for(int i=0;i<extraData.length;i++){
|
||||
extraData[i]=dataInputStream.readChar();
|
||||
}
|
||||
|
||||
//Read the combining class table
|
||||
for(int i=0; i<combiningTable.length; i++){
|
||||
combiningTable[i]=dataInputStream.readChar();
|
||||
}
|
||||
|
||||
//Read the fcdTrie
|
||||
dataInputStream.readFully(fcdBytes);
|
||||
|
||||
|
||||
//Read the AuxTrie
|
||||
dataInputStream.readFully(auxBytes);
|
||||
}
|
||||
|
||||
public byte[] getDataFormatVersion(){
|
||||
return DATA_FORMAT_VERSION;
|
||||
}
|
||||
|
||||
public boolean isDataVersionAcceptable(byte version[])
|
||||
{
|
||||
return version[0] == DATA_FORMAT_VERSION[0]
|
||||
&& version[2] == DATA_FORMAT_VERSION[2]
|
||||
&& version[3] == DATA_FORMAT_VERSION[3];
|
||||
}
|
||||
|
||||
public byte[] getUnicodeVersion(){
|
||||
return unicodeVersion;
|
||||
}
|
||||
// private data members -------------------------------------------------
|
||||
|
||||
|
||||
/**
|
||||
* ICU data file input stream
|
||||
*/
|
||||
private DataInputStream dataInputStream;
|
||||
|
||||
private byte[] unicodeVersion;
|
||||
|
||||
/**
|
||||
* File format version that this class understands.
|
||||
* No guarantees are made if a older version is used
|
||||
* see store.c of gennorm for more information and values
|
||||
*/
|
||||
private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
|
||||
(byte)0x72, (byte)0x6D};
|
||||
private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
|
||||
(byte)0x5, (byte)0x2};
|
||||
|
||||
}
|
||||
2736
jdkSrc/jdk8/sun/text/normalizer/NormalizerImpl.java
Normal file
2736
jdkSrc/jdk8/sun/text/normalizer/NormalizerImpl.java
Normal file
File diff suppressed because it is too large
Load Diff
140
jdkSrc/jdk8/sun/text/normalizer/RangeValueIterator.java
Normal file
140
jdkSrc/jdk8/sun/text/normalizer/RangeValueIterator.java
Normal file
@@ -0,0 +1,140 @@
|
||||
/*
|
||||
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
/**
|
||||
* <p>Interface for enabling iteration over sets of <int index, int value>,
|
||||
* where index is the sorted integer index in ascending order and value, its
|
||||
* associated integer value.</p>
|
||||
* <p>The result for each iteration is the consecutive range of
|
||||
* <int index, int value> with the same value. Result is represented by
|
||||
* <start, limit, value> where</p>
|
||||
* <ul>
|
||||
* <li> start is the starting integer of the result range
|
||||
* <li> limit is 1 after the maximum integer that follows start, such that
|
||||
* all integers between start and (limit - 1), inclusive, have the same
|
||||
* associated integer value.
|
||||
* <li> value is the integer value that all integers from start to (limit - 1)
|
||||
* share in common.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Hence value(start) = value(start + 1) = .... = value(start + n) = .... =
|
||||
* value(limit - 1). However value(start -1) != value(start) and
|
||||
* value(limit) != value(start).
|
||||
* </p>
|
||||
* <p>Most implementations will be created by factory methods, such as the
|
||||
* character type iterator in UCharacter.getTypeIterator. See example below.
|
||||
* </p>
|
||||
* Example of use:<br>
|
||||
* <pre>
|
||||
* RangeValueIterator iterator = UCharacter.getTypeIterator();
|
||||
* RangeValueIterator.Element result = new RangeValueIterator.Element();
|
||||
* while (iterator.next(result)) {
|
||||
* System.out.println("Codepoint \\u" +
|
||||
* Integer.toHexString(result.start) +
|
||||
* " to codepoint \\u" +
|
||||
* Integer.toHexString(result.limit - 1) +
|
||||
* " has the character type " + result.value);
|
||||
* }
|
||||
* </pre>
|
||||
* @author synwee
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public interface RangeValueIterator
|
||||
{
|
||||
// public inner class ---------------------------------------------
|
||||
|
||||
/**
|
||||
* Return result wrapper for com.ibm.icu.util.RangeValueIterator.
|
||||
* Stores the start and limit of the continous result range and the
|
||||
* common value all integers between [start, limit - 1] has.
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public class Element
|
||||
{
|
||||
// public data member ---------------------------------------------
|
||||
|
||||
/**
|
||||
* Starting integer of the continuous result range that has the same
|
||||
* value
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public int start;
|
||||
/**
|
||||
* (End + 1) integer of continuous result range that has the same
|
||||
* value
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public int limit;
|
||||
/**
|
||||
* Gets the common value of the continous result range
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public int value;
|
||||
|
||||
// public constructor --------------------------------------------
|
||||
|
||||
/**
|
||||
* Empty default constructor to make javadoc happy
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public Element()
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
// public methods -------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Gets the next maximal result range with a common value and returns
|
||||
* true if we are not at the end of the iteration, false otherwise.</p>
|
||||
* <p>If the return boolean is a false, the contents of elements will not
|
||||
* be updated.</p>
|
||||
* @param element for storing the result range and value
|
||||
* @return true if we are not at the end of the iteration, false otherwise.
|
||||
* @see Element
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public boolean next(Element element);
|
||||
|
||||
/**
|
||||
* Resets the iterator to the beginning of the iteration.
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public void reset();
|
||||
}
|
||||
123
jdkSrc/jdk8/sun/text/normalizer/Replaceable.java
Normal file
123
jdkSrc/jdk8/sun/text/normalizer/Replaceable.java
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
/**
 * <code>Replaceable</code> is an interface representing a string of
 * characters in which a range can be replaced by a new string of
 * characters. It is used by APIs that change a piece of text while
 * retaining metadata. Metadata is data other than the Unicode characters
 * returned by char32At(); one example is style attributes, another is an
 * edit history that marks each character with an author and revision
 * number.
 *
 * <p>An implicit aspect of the <code>Replaceable</code> API is that,
 * during a replace operation, new characters take on the metadata of
 * the old characters. For example, if the string "the <b>bold</b>
 * font" has range (4, 8) replaced with "strong", then it becomes "the
 * <b>strong</b> font".
 *
 * <p><code>Replaceable</code> specifies ranges using a start offset and
 * a limit offset. The range thus specified includes the characters at
 * offsets start..limit-1; that is, the start offset is inclusive and the
 * limit offset is exclusive.
 *
 * <p><code>Replaceable</code> also includes API to access characters
 * in the string: <code>length()</code>, <code>charAt()</code>,
 * <code>char32At()</code>, and <code>extractBetween()</code>.
 *
 * <p>For a subclass to support metadata, typical behavior of
 * <code>replace()</code> is the following:
 * <ul>
 *   <li>Set the metadata of the new text to the metadata of the first
 *       character replaced</li>
 *   <li>If no characters are replaced, use the metadata of the
 *       previous character</li>
 *   <li>If there is no previous character (i.e. start == 0), use the
 *       following character</li>
 *   <li>If there is no following character (i.e. the replaceable was
 *       empty), use default metadata</li>
 *   <li>If the code point U+FFFF is seen, it should be interpreted as
 *       a special marker having no metadata</li>
 * </ul>
 * If this is not the behavior, the subclass should document any differences.
 *
 * <p>Note: this copy of the interface declares only
 * <code>length()</code>, <code>charAt()</code> and
 * <code>getChars()</code>; the other operations named above are not
 * declared here.
 *
 * <p>Copyright © IBM Corporation 1999. All rights reserved.
 *
 * @author Alan Liu
 * @stable ICU 2.0
 */
public interface Replaceable {

    /**
     * Reports the size of the text, counted in 16-bit code units.
     *
     * @return the number of 16-bit code units in the text
     * @stable ICU 2.0
     */
    int length();

    /**
     * Fetches the single 16-bit code unit stored at a given offset.
     *
     * @param offset position in the text, from 0 to
     * <code>length()</code>-1 inclusive
     * @return the 16-bit code unit found at <code>offset</code>
     * @stable ICU 2.0
     */
    char charAt(int offset);

    //// for StringPrep
    /**
     * Copies a range of this object's characters into a destination
     * character array. Copying starts with the character at index
     * <code>srcStart</code> and ends with the character at index
     * <code>srcLimit-1</code>, so <code>srcLimit-srcStart</code>
     * characters are copied in total. They are stored in
     * <code>dst</code> from index <code>dstStart</code> through index
     * <code>dstStart + (srcLimit-srcStart) - 1</code>.
     *
     * @param srcStart the first index to copy, inclusive;
     * {@code 0 <= srcStart <= srcLimit}.
     * @param srcLimit the index at which copying stops, exclusive;
     * {@code srcStart <= srcLimit <= length()}.
     * @param dst the destination array.
     * @param dstStart the offset in the destination array at which the
     * first copied character is stored.
     * @stable ICU 2.0
     */
    void getChars(int srcStart, int srcLimit, char[] dst, int dstStart);
}
|
||||
123
jdkSrc/jdk8/sun/text/normalizer/ReplaceableString.java
Normal file
123
jdkSrc/jdk8/sun/text/normalizer/ReplaceableString.java
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
/**
|
||||
* <code>ReplaceableString</code> is an adapter class that implements the
|
||||
* <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
|
||||
*
|
||||
* <p><em>Note:</em> This class does not support attributes and is not
|
||||
* intended for general use. Most clients will need to implement
|
||||
* {@link Replaceable} in their text representation class.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @see Replaceable
|
||||
* @author Alan Liu
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public class ReplaceableString implements Replaceable {
|
||||
|
||||
private StringBuffer buf;
|
||||
|
||||
/**
|
||||
* Construct a new object with the given initial contents.
|
||||
* @param str initial contents
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public ReplaceableString(String str) {
|
||||
buf = new StringBuffer(str);
|
||||
}
|
||||
|
||||
//// for StringPrep
|
||||
/**
|
||||
* Construct a new object using <code>buf</code> for internal
|
||||
* storage. The contents of <code>buf</code> at the time of
|
||||
* construction are used as the initial contents. <em>Note!
|
||||
* Modifications to <code>buf</code> will modify this object, and
|
||||
* vice versa.</em>
|
||||
* @param buf object to be used as internal storage
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public ReplaceableString(StringBuffer buf) {
|
||||
this.buf = buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of characters contained in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int length() {
|
||||
return buf.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the character at the given position in this object.
|
||||
* <code>Replaceable</code> API.
|
||||
* @param offset offset into the contents, from 0 to
|
||||
* <code>length()</code> - 1
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public char charAt(int offset) {
|
||||
return buf.charAt(offset);
|
||||
}
|
||||
|
||||
//// for StringPrep
|
||||
/**
|
||||
* Copies characters from this object into the destination
|
||||
* character array. The first character to be copied is at index
|
||||
* <code>srcStart</code>; the last character to be copied is at
|
||||
* index <code>srcLimit-1</code> (thus the total number of
|
||||
* characters to be copied is <code>srcLimit-srcStart</code>). The
|
||||
* characters are copied into the subarray of <code>dst</code>
|
||||
* starting at index <code>dstStart</code> and ending at index
|
||||
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
|
||||
*
|
||||
* @param srcStart the beginning index to copy, inclusive; <code>0
|
||||
* <= start <= limit</code>.
|
||||
* @param srcLimit the ending index to copy, exclusive;
|
||||
* <code>start <= limit <= length()</code>.
|
||||
* @param dst the destination array.
|
||||
* @param dstStart the start offset in the destination array.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
|
||||
Utility.getChars(buf, srcStart, srcLimit, dst, dstStart);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
/**
|
||||
* DLF docs must define behavior when Replaceable is mutated underneath
|
||||
* the iterator.
|
||||
*
|
||||
* This and ICUCharacterIterator share some code, maybe they should share
|
||||
* an implementation, or the common state and implementation should be
|
||||
* moved up into UCharacterIterator.
|
||||
*
|
||||
* What are first, last, and getBeginIndex doing here?!?!?!
|
||||
*/
|
||||
public class ReplaceableUCharacterIterator extends UCharacterIterator {
|
||||
|
||||
// public constructor ------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Public constructor
|
||||
* @param str text which the iterator will be based on
|
||||
*/
|
||||
public ReplaceableUCharacterIterator(String str){
|
||||
if(str==null){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.replaceable = new ReplaceableString(str);
|
||||
this.currentIndex = 0;
|
||||
}
|
||||
|
||||
//// for StringPrep
|
||||
/**
|
||||
* Public constructor
|
||||
* @param buf buffer of text on which the iterator will be based
|
||||
*/
|
||||
public ReplaceableUCharacterIterator(StringBuffer buf){
|
||||
if(buf==null){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.replaceable = new ReplaceableString(buf);
|
||||
this.currentIndex = 0;
|
||||
}
|
||||
|
||||
// public methods ----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Creates a copy of this iterator, does not clone the underlying
|
||||
* <code>Replaceable</code>object
|
||||
* @return copy of this iterator
|
||||
*/
|
||||
public Object clone(){
|
||||
try {
|
||||
return super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
return null; // never invoked
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current UTF16 character.
|
||||
* @return current UTF16 character
|
||||
*/
|
||||
public int current(){
|
||||
if (currentIndex < replaceable.length()) {
|
||||
return replaceable.charAt(currentIndex);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length of the text
|
||||
* @return length of the text
|
||||
*/
|
||||
public int getLength(){
|
||||
return replaceable.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the current currentIndex in text.
|
||||
* @return current currentIndex in text.
|
||||
*/
|
||||
public int getIndex(){
|
||||
return currentIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns next UTF16 character and increments the iterator's currentIndex by 1.
|
||||
* If the resulting currentIndex is greater or equal to the text length, the
|
||||
* currentIndex is reset to the text length and a value of DONECODEPOINT is
|
||||
* returned.
|
||||
* @return next UTF16 character in text or DONE if the new currentIndex is off the
|
||||
* end of the text range.
|
||||
*/
|
||||
public int next(){
|
||||
if (currentIndex < replaceable.length()) {
|
||||
return replaceable.charAt(currentIndex++);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns previous UTF16 character and decrements the iterator's currentIndex by
|
||||
* 1.
|
||||
* If the resulting currentIndex is less than 0, the currentIndex is reset to 0 and a
|
||||
* value of DONECODEPOINT is returned.
|
||||
* @return next UTF16 character in text or DONE if the new currentIndex is off the
|
||||
* start of the text range.
|
||||
*/
|
||||
public int previous(){
|
||||
if (currentIndex > 0) {
|
||||
return replaceable.charAt(--currentIndex);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Sets the currentIndex to the specified currentIndex in the text and returns that
|
||||
* single UTF16 character at currentIndex.
|
||||
* This assumes the text is stored as 16-bit code units.</p>
|
||||
* @param currentIndex the currentIndex within the text.
|
||||
* @exception IllegalArgumentException is thrown if an invalid currentIndex is
|
||||
* supplied. i.e. currentIndex is out of bounds.
|
||||
* @return the character at the specified currentIndex or DONE if the specified
|
||||
* currentIndex is equal to the end of the text.
|
||||
*/
|
||||
public void setIndex(int currentIndex) {
|
||||
if (currentIndex < 0 || currentIndex > replaceable.length()) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.currentIndex = currentIndex;
|
||||
}
|
||||
|
||||
//// for StringPrep
|
||||
public int getText(char[] fillIn, int offset){
|
||||
int length = replaceable.length();
|
||||
if(offset < 0 || offset + length > fillIn.length){
|
||||
throw new IndexOutOfBoundsException(Integer.toString(length));
|
||||
}
|
||||
replaceable.getChars(0,length,fillIn,offset);
|
||||
return length;
|
||||
}
|
||||
|
||||
// private data members ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Replaceable object
|
||||
*/
|
||||
private Replaceable replaceable;
|
||||
/**
|
||||
* Current currentIndex
|
||||
*/
|
||||
private int currentIndex;
|
||||
|
||||
}
|
||||
367
jdkSrc/jdk8/sun/text/normalizer/RuleCharacterIterator.java
Normal file
367
jdkSrc/jdk8/sun/text/normalizer/RuleCharacterIterator.java
Normal file
@@ -0,0 +1,367 @@
|
||||
/*
|
||||
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
/*
|
||||
**********************************************************************
|
||||
* Author: Alan Liu
|
||||
* Created: September 23 2003
|
||||
* Since: ICU 2.8
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.text.ParsePosition;
|
||||
|
||||
/**
|
||||
* An iterator that returns 32-bit code points. This class is deliberately
|
||||
* <em>not</em> related to any of the JDK or ICU4J character iterator classes
|
||||
* in order to minimize complexity.
|
||||
* @author Alan Liu
|
||||
* @since ICU 2.8
|
||||
*/
|
||||
public class RuleCharacterIterator {
|
||||
|
||||
// TODO: Ideas for later. (Do not implement if not needed, lest the
|
||||
// code coverage numbers go down due to unused methods.)
|
||||
// 1. Add a copy constructor, equals() method, clone() method.
|
||||
// 2. Rather than return DONE, throw an exception if the end
|
||||
// is reached -- this is an alternate usage model, probably not useful.
|
||||
// 3. Return isEscaped from next(). If this happens,
|
||||
// don't keep an isEscaped member variable.
|
||||
|
||||
/**
|
||||
* Text being iterated.
|
||||
*/
|
||||
private String text;
|
||||
|
||||
/**
|
||||
* Position of iterator.
|
||||
*/
|
||||
private ParsePosition pos;
|
||||
|
||||
/**
|
||||
* Symbol table used to parse and dereference variables. May be null.
|
||||
*/
|
||||
private SymbolTable sym;
|
||||
|
||||
/**
|
||||
* Current variable expansion, or null if none.
|
||||
*/
|
||||
private char[] buf;
|
||||
|
||||
/**
|
||||
* Position within buf[]. Meaningless if buf == null.
|
||||
*/
|
||||
private int bufPos;
|
||||
|
||||
/**
|
||||
* Flag indicating whether the last character was parsed from an escape.
|
||||
*/
|
||||
private boolean isEscaped;
|
||||
|
||||
/**
|
||||
* Value returned when there are no more characters to iterate.
|
||||
*/
|
||||
public static final int DONE = -1;
|
||||
|
||||
/**
|
||||
* Bitmask option to enable parsing of variable names. If (options &
|
||||
* PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
|
||||
* its value. Variables are parsed using the SymbolTable API.
|
||||
*/
|
||||
public static final int PARSE_VARIABLES = 1;
|
||||
|
||||
/**
|
||||
* Bitmask option to enable parsing of escape sequences. If (options &
|
||||
* PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
|
||||
* to its value. Escapes are parsed using Utility.unescapeAt().
|
||||
*/
|
||||
public static final int PARSE_ESCAPES = 2;
|
||||
|
||||
/**
|
||||
* Bitmask option to enable skipping of whitespace. If (options &
|
||||
* SKIP_WHITESPACE) != 0, then whitespace characters will be silently
|
||||
* skipped, as if they were not present in the input. Whitespace
|
||||
* characters are defined by UCharacterProperty.isRuleWhiteSpace().
|
||||
*/
|
||||
public static final int SKIP_WHITESPACE = 4;
|
||||
|
||||
/**
|
||||
* Constructs an iterator over the given text, starting at the given
|
||||
* position.
|
||||
* @param text the text to be iterated
|
||||
* @param sym the symbol table, or null if there is none. If sym is null,
|
||||
* then variables will not be deferenced, even if the PARSE_VARIABLES
|
||||
* option is set.
|
||||
* @param pos upon input, the index of the next character to return. If a
|
||||
* variable has been dereferenced, then pos will <em>not</em> increment as
|
||||
* characters of the variable value are iterated.
|
||||
*/
|
||||
public RuleCharacterIterator(String text, SymbolTable sym,
|
||||
ParsePosition pos) {
|
||||
if (text == null || pos.getIndex() > text.length()) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.text = text;
|
||||
this.sym = sym;
|
||||
this.pos = pos;
|
||||
buf = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this iterator has no more characters to return.
|
||||
*/
|
||||
public boolean atEnd() {
|
||||
return buf == null && pos.getIndex() == text.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next character using the given options, or DONE if there
|
||||
* are no more characters, and advance the position to the next
|
||||
* character.
|
||||
* @param options one or more of the following options, bitwise-OR-ed
|
||||
* together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
|
||||
* @return the current 32-bit code point, or DONE
|
||||
*/
|
||||
public int next(int options) {
|
||||
int c = DONE;
|
||||
isEscaped = false;
|
||||
|
||||
for (;;) {
|
||||
c = _current();
|
||||
_advance(UTF16.getCharCount(c));
|
||||
|
||||
if (c == SymbolTable.SYMBOL_REF && buf == null &&
|
||||
(options & PARSE_VARIABLES) != 0 && sym != null) {
|
||||
String name = sym.parseReference(text, pos, text.length());
|
||||
// If name == null there was an isolated SYMBOL_REF;
|
||||
// return it. Caller must be prepared for this.
|
||||
if (name == null) {
|
||||
break;
|
||||
}
|
||||
bufPos = 0;
|
||||
buf = sym.lookup(name);
|
||||
if (buf == null) {
|
||||
throw new IllegalArgumentException(
|
||||
"Undefined variable: " + name);
|
||||
}
|
||||
// Handle empty variable value
|
||||
if (buf.length == 0) {
|
||||
buf = null;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((options & SKIP_WHITESPACE) != 0 &&
|
||||
UCharacterProperty.isRuleWhiteSpace(c)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
|
||||
int offset[] = new int[] { 0 };
|
||||
c = Utility.unescapeAt(lookahead(), offset);
|
||||
jumpahead(offset[0]);
|
||||
isEscaped = true;
|
||||
if (c < 0) {
|
||||
throw new IllegalArgumentException("Invalid escape");
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the last character returned by next() was
|
||||
* escaped. This will only be the case if the option passed in to
|
||||
* next() included PARSE_ESCAPED and the next character was an
|
||||
* escape sequence.
|
||||
*/
|
||||
public boolean isEscaped() {
|
||||
return isEscaped;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this iterator is currently within a variable expansion.
|
||||
*/
|
||||
public boolean inVariable() {
|
||||
return buf != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an object which, when later passed to setPos(), will
|
||||
* restore this iterator's position. Usage idiom:
|
||||
*
|
||||
* RuleCharacterIterator iterator = ...;
|
||||
* Object pos = iterator.getPos(null); // allocate position object
|
||||
* for (;;) {
|
||||
* pos = iterator.getPos(pos); // reuse position object
|
||||
* int c = iterator.next(...);
|
||||
* ...
|
||||
* }
|
||||
* iterator.setPos(pos);
|
||||
*
|
||||
* @param p a position object previously returned by getPos(),
|
||||
* or null. If not null, it will be updated and returned. If
|
||||
* null, a new position object will be allocated and returned.
|
||||
* @return a position object which may be passed to setPos(),
|
||||
* either `p,' or if `p' == null, a newly-allocated object
|
||||
*/
|
||||
public Object getPos(Object p) {
|
||||
if (p == null) {
|
||||
return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
|
||||
}
|
||||
Object[] a = (Object[]) p;
|
||||
a[0] = buf;
|
||||
int[] v = (int[]) a[1];
|
||||
v[0] = pos.getIndex();
|
||||
v[1] = bufPos;
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* Restores this iterator to the position it had when getPos()
|
||||
* returned the given object.
|
||||
* @param p a position object previously returned by getPos()
|
||||
*/
|
||||
public void setPos(Object p) {
|
||||
Object[] a = (Object[]) p;
|
||||
buf = (char[]) a[0];
|
||||
int[] v = (int[]) a[1];
|
||||
pos.setIndex(v[0]);
|
||||
bufPos = v[1];
|
||||
}
|
||||
|
||||
/**
|
||||
* Skips ahead past any ignored characters, as indicated by the given
|
||||
* options. This is useful in conjunction with the lookahead() method.
|
||||
*
|
||||
* Currently, this only has an effect for SKIP_WHITESPACE.
|
||||
* @param options one or more of the following options, bitwise-OR-ed
|
||||
* together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
|
||||
*/
|
||||
public void skipIgnored(int options) {
|
||||
if ((options & SKIP_WHITESPACE) != 0) {
|
||||
for (;;) {
|
||||
int a = _current();
|
||||
if (!UCharacterProperty.isRuleWhiteSpace(a)) break;
|
||||
_advance(UTF16.getCharCount(a));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string containing the remainder of the characters to be
|
||||
* returned by this iterator, without any option processing. If the
|
||||
* iterator is currently within a variable expansion, this will only
|
||||
* extend to the end of the variable expansion. This method is provided
|
||||
* so that iterators may interoperate with string-based APIs. The typical
|
||||
* sequence of calls is to call skipIgnored(), then call lookahead(), then
|
||||
* parse the string returned by lookahead(), then call jumpahead() to
|
||||
* resynchronize the iterator.
|
||||
* @return a string containing the characters to be returned by future
|
||||
* calls to next()
|
||||
*/
|
||||
public String lookahead() {
|
||||
if (buf != null) {
|
||||
return new String(buf, bufPos, buf.length - bufPos);
|
||||
} else {
|
||||
return text.substring(pos.getIndex());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Advances the position by the given number of 16-bit code units.
|
||||
* This is useful in conjunction with the lookahead() method.
|
||||
* @param count the number of 16-bit code units to jump over
|
||||
*/
|
||||
public void jumpahead(int count) {
|
||||
if (count < 0) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
if (buf != null) {
|
||||
bufPos += count;
|
||||
if (bufPos > buf.length) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
if (bufPos == buf.length) {
|
||||
buf = null;
|
||||
}
|
||||
} else {
|
||||
int i = pos.getIndex() + count;
|
||||
pos.setIndex(i);
|
||||
if (i > text.length()) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current 32-bit code point without parsing escapes, parsing
|
||||
* variables, or skipping whitespace.
|
||||
* @return the current 32-bit code point
|
||||
*/
|
||||
private int _current() {
|
||||
if (buf != null) {
|
||||
return UTF16.charAt(buf, 0, buf.length, bufPos);
|
||||
} else {
|
||||
int i = pos.getIndex();
|
||||
return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Advances the position by the given amount.
|
||||
* @param count the number of 16-bit code units to advance past
|
||||
*/
|
||||
private void _advance(int count) {
|
||||
if (buf != null) {
|
||||
bufPos += count;
|
||||
if (bufPos == buf.length) {
|
||||
buf = null;
|
||||
}
|
||||
} else {
|
||||
pos.setIndex(pos.getIndex() + count);
|
||||
if (pos.getIndex() > text.length()) {
|
||||
pos.setIndex(text.length());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
124
jdkSrc/jdk8/sun/text/normalizer/SymbolTable.java
Normal file
124
jdkSrc/jdk8/sun/text/normalizer/SymbolTable.java
Normal file
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.text.ParsePosition;
|
||||
|
||||
/**
|
||||
* An interface that defines both lookup protocol and parsing of
|
||||
* symbolic names.
|
||||
*
|
||||
* <p>A symbol table maintains two kinds of mappings. The first is
|
||||
* between symbolic names and their values. For example, if the
|
||||
* variable with the name "start" is set to the value "alpha"
|
||||
* (perhaps, though not necessarily, through an expression such as
|
||||
* "$start=alpha"), then the call lookup("start") will return the
|
||||
* char[] array ['a', 'l', 'p', 'h', 'a'].
|
||||
*
|
||||
* <p>The second kind of mapping is between character values and
|
||||
* UnicodeMatcher objects. This is used by RuleBasedTransliterator,
|
||||
* which uses characters in the private use area to represent objects
|
||||
* such as UnicodeSets. If U+E015 is mapped to the UnicodeSet [a-z],
|
||||
* then lookupMatcher(0xE015) will return the UnicodeSet [a-z].
|
||||
*
|
||||
* <p>Finally, a symbol table defines parsing behavior for symbolic
|
||||
* names. All symbolic names start with the SYMBOL_REF character.
|
||||
* When a parser encounters this character, it calls parseReference()
|
||||
* with the position immediately following the SYMBOL_REF. The symbol
|
||||
* table parses the name, if there is one, and returns it.
|
||||
*
|
||||
* @draft ICU 2.8
|
||||
* @deprecated This is a draft API and might change in a future release of ICU.
|
||||
*/
|
||||
@Deprecated
|
||||
public interface SymbolTable {
|
||||
|
||||
/**
|
||||
* The character preceding a symbol reference name.
|
||||
* @draft ICU 2.8
|
||||
* @deprecated This is a draft API and might change in a future release of ICU.
|
||||
*/
|
||||
@Deprecated
|
||||
static final char SYMBOL_REF = '$';
|
||||
|
||||
/**
|
||||
* Lookup the characters associated with this string and return it.
|
||||
* Return <tt>null</tt> if no such name exists. The resultant
|
||||
* array may have length zero.
|
||||
* @param s the symbolic name to lookup
|
||||
* @return a char array containing the name's value, or null if
|
||||
* there is no mapping for s.
|
||||
* @draft ICU 2.8
|
||||
* @deprecated This is a draft API and might change in a future release of ICU.
|
||||
*/
|
||||
@Deprecated
|
||||
char[] lookup(String s);
|
||||
|
||||
/**
|
||||
* Lookup the UnicodeMatcher associated with the given character, and
|
||||
* return it. Return <tt>null</tt> if not found.
|
||||
* @param ch a 32-bit code point from 0 to 0x10FFFF inclusive.
|
||||
* @return the UnicodeMatcher object represented by the given
|
||||
* character, or null if there is no mapping for ch.
|
||||
* @draft ICU 2.8
|
||||
* @deprecated This is a draft API and might change in a future release of ICU.
|
||||
*/
|
||||
@Deprecated
|
||||
UnicodeMatcher lookupMatcher(int ch);
|
||||
|
||||
/**
|
||||
* Parse a symbol reference name from the given string, starting
|
||||
* at the given position. If no valid symbol reference name is
|
||||
* found, return null and leave pos unchanged. That is, if the
|
||||
* character at pos cannot start a name, or if pos is at or after
|
||||
* text.length(), then return null. This indicates an isolated
|
||||
* SYMBOL_REF character.
|
||||
* @param text the text to parse for the name
|
||||
* @param pos on entry, the index of the first character to parse.
|
||||
* This is the character following the SYMBOL_REF character. On
|
||||
* exit, the index after the last parsed character. If the parse
|
||||
* failed, pos is unchanged on exit.
|
||||
* @param limit the index after the last character to be parsed.
|
||||
* @return the parsed name, or null if there is no valid symbolic
|
||||
* name at the given position.
|
||||
* @draft ICU 2.8
|
||||
* @deprecated This is a draft API and might change in a future release of ICU.
|
||||
*/
|
||||
@Deprecated
|
||||
String parseReference(String text, ParsePosition pos, int limit);
|
||||
}
|
||||
419
jdkSrc/jdk8/sun/text/normalizer/Trie.java
Normal file
419
jdkSrc/jdk8/sun/text/normalizer/Trie.java
Normal file
@@ -0,0 +1,419 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* <p>A trie is a kind of compressed, serializable table of values
|
||||
* associated with Unicode code points (0..0x10ffff).</p>
|
||||
* <p>This class defines the basic structure of a trie and provides methods
|
||||
* to <b>retrieve the offsets to the actual data</b>.</p>
|
||||
* <p>Data will be the form of an array of basic types, char or int.</p>
|
||||
* <p>The actual data format will have to be specified by the user in the
|
||||
* inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
|
||||
* <p>This trie implementation is optimized for getting offset while walking
|
||||
* forward through a UTF-16 string.
|
||||
* Therefore, the simplest and fastest access macros are the
|
||||
* fromLead() and fromOffsetTrail() methods.
|
||||
* The fromBMP() method are a little more complicated; they get offsets even
|
||||
* for lead surrogate codepoints, while the fromLead() method get special
|
||||
* "folded" offsets for lead surrogate code units if there is relevant data
|
||||
* associated with them.
|
||||
* From such a folded offsets, an offset needs to be extracted to supply
|
||||
* to the fromOffsetTrail() methods.
|
||||
* To handle such supplementary codepoints, some offset information are kept
|
||||
* in the data.</p>
|
||||
* <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
|
||||
* that offset from the folded value for the lead surrogate unit.</p>
|
||||
* <p>For examples of use, see com.ibm.icu.impl.CharTrie or
|
||||
* com.ibm.icu.impl.IntTrie.</p>
|
||||
* @author synwee
|
||||
* @see com.ibm.icu.impl.CharTrie
|
||||
* @see com.ibm.icu.impl.IntTrie
|
||||
* @since release 2.1, Jan 01 2002
|
||||
*/
|
||||
public abstract class Trie
|
||||
{
|
||||
// public class declaration ----------------------------------------
|
||||
|
||||
/**
|
||||
* Character data in com.ibm.impl.Trie have different user-specified format
|
||||
* for different purposes.
|
||||
* This interface specifies methods to be implemented in order for
|
||||
* com.ibm.impl.Trie, to surrogate offset information encapsulated within
|
||||
* the data.
|
||||
*/
|
||||
public static interface DataManipulate
|
||||
{
|
||||
/**
|
||||
* Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
|
||||
* data
|
||||
* the index array offset of the indexes for that lead surrogate.
|
||||
* @param value data value for a surrogate from the trie, including the
|
||||
* folding offset
|
||||
* @return data offset or 0 if there is no data for the lead surrogate
|
||||
*/
|
||||
public int getFoldingOffset(int value);
|
||||
}
|
||||
|
||||
// default implementation
|
||||
private static class DefaultGetFoldingOffset implements DataManipulate {
|
||||
public int getFoldingOffset(int value) {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
||||
// protected constructor -------------------------------------------
|
||||
|
||||
/**
|
||||
* Trie constructor for CharTrie use.
|
||||
* @param inputStream ICU data file input stream which contains the
|
||||
* trie
|
||||
* @param dataManipulate object containing the information to parse the
|
||||
* trie data
|
||||
* @throws IOException thrown when input stream does not have the
|
||||
* right header.
|
||||
*/
|
||||
protected Trie(InputStream inputStream,
|
||||
DataManipulate dataManipulate) throws IOException
|
||||
{
|
||||
DataInputStream input = new DataInputStream(inputStream);
|
||||
// Magic number to authenticate the data.
|
||||
int signature = input.readInt();
|
||||
m_options_ = input.readInt();
|
||||
|
||||
if (!checkHeader(signature)) {
|
||||
throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
|
||||
}
|
||||
|
||||
if(dataManipulate != null) {
|
||||
m_dataManipulate_ = dataManipulate;
|
||||
} else {
|
||||
m_dataManipulate_ = new DefaultGetFoldingOffset();
|
||||
}
|
||||
m_isLatin1Linear_ = (m_options_ &
|
||||
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
|
||||
m_dataOffset_ = input.readInt();
|
||||
m_dataLength_ = input.readInt();
|
||||
unserialize(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Trie constructor
|
||||
* @param index array to be used for index
|
||||
* @param options used by the trie
|
||||
* @param dataManipulate object containing the information to parse the
|
||||
* trie data
|
||||
*/
|
||||
protected Trie(char index[], int options, DataManipulate dataManipulate)
|
||||
{
|
||||
m_options_ = options;
|
||||
if(dataManipulate != null) {
|
||||
m_dataManipulate_ = dataManipulate;
|
||||
} else {
|
||||
m_dataManipulate_ = new DefaultGetFoldingOffset();
|
||||
}
|
||||
m_isLatin1Linear_ = (m_options_ &
|
||||
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
|
||||
m_index_ = index;
|
||||
m_dataOffset_ = m_index_.length;
|
||||
}
|
||||
|
||||
// protected data members ------------------------------------------
|
||||
|
||||
/**
|
||||
* Lead surrogate code points' index displacement in the index array.
|
||||
* 0x10000-0xd800=0x2800
|
||||
* 0x2800 >> INDEX_STAGE_1_SHIFT_
|
||||
*/
|
||||
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
|
||||
/**
|
||||
* Shift size for shifting right the input index. 1..9
|
||||
*/
|
||||
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
|
||||
/**
|
||||
* Shift size for shifting left the index array values.
|
||||
* Increases possible data size with 16-bit index values at the cost
|
||||
* of compactability.
|
||||
* This requires blocks of stage 2 data to be aligned by
|
||||
* DATA_GRANULARITY.
|
||||
* 0..INDEX_STAGE_1_SHIFT
|
||||
*/
|
||||
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
|
||||
/**
|
||||
* Number of data values in a stage 2 (data array) block.
|
||||
*/
|
||||
protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
|
||||
/**
|
||||
* Mask for getting the lower bits from the input index.
|
||||
* DATA_BLOCK_LENGTH - 1.
|
||||
*/
|
||||
protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
|
||||
/** Number of bits of a trail surrogate that are used in index table lookups. */
|
||||
protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
|
||||
/**
|
||||
* Number of index (stage 1) entries per lead surrogate.
|
||||
* Same as number of index entries for 1024 trail surrogates,
|
||||
* ==0x400>>INDEX_STAGE_1_SHIFT_
|
||||
*/
|
||||
protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
|
||||
/** Length of the BMP portion of the index (stage 1) array. */
|
||||
protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
|
||||
/**
|
||||
* Surrogate mask to use when shifting offset to retrieve supplementary
|
||||
* values
|
||||
*/
|
||||
protected static final int SURROGATE_MASK_ = 0x3FF;
|
||||
/**
|
||||
* Index or UTF16 characters
|
||||
*/
|
||||
protected char m_index_[];
|
||||
/**
|
||||
* Internal TrieValue which handles the parsing of the data value.
|
||||
* This class is to be implemented by the user
|
||||
*/
|
||||
protected DataManipulate m_dataManipulate_;
|
||||
/**
|
||||
* Start index of the data portion of the trie. CharTrie combines
|
||||
* index and data into a char array, so this is used to indicate the
|
||||
* initial offset to the data portion.
|
||||
* Note this index always points to the initial value.
|
||||
*/
|
||||
protected int m_dataOffset_;
|
||||
/**
|
||||
* Length of the data array
|
||||
*/
|
||||
protected int m_dataLength_;
|
||||
|
||||
// protected methods -----------------------------------------------
|
||||
|
||||
/**
|
||||
* Gets the offset to the data which the surrogate pair points to.
|
||||
* @param lead lead surrogate
|
||||
* @param trail trailing surrogate
|
||||
* @return offset to data
|
||||
*/
|
||||
protected abstract int getSurrogateOffset(char lead, char trail);
|
||||
|
||||
/**
|
||||
* Gets the value at the argument index
|
||||
* @param index value at index will be retrieved
|
||||
* @return 32 bit value
|
||||
*/
|
||||
protected abstract int getValue(int index);
|
||||
|
||||
/**
|
||||
* Gets the default initial value
|
||||
* @return 32 bit value
|
||||
*/
|
||||
protected abstract int getInitialValue();
|
||||
|
||||
/**
|
||||
* Gets the offset to the data which the index ch after variable offset
|
||||
* points to.
|
||||
* Note for locating a non-supplementary character data offset, calling
|
||||
* <p>
|
||||
* getRawOffset(0, ch);
|
||||
* </p>
|
||||
* will do. Otherwise if it is a supplementary character formed by
|
||||
* surrogates lead and trail. Then we would have to call getRawOffset()
|
||||
* with getFoldingIndexOffset(). See getSurrogateOffset().
|
||||
* @param offset index offset which ch is to start from
|
||||
* @param ch index to be used after offset
|
||||
* @return offset to the data
|
||||
*/
|
||||
protected final int getRawOffset(int offset, char ch)
|
||||
{
|
||||
return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
|
||||
<< INDEX_STAGE_2_SHIFT_)
|
||||
+ (ch & INDEX_STAGE_3_MASK_);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the offset to data which the BMP character points to
|
||||
* Treats a lead surrogate as a normal code point.
|
||||
* @param ch BMP character
|
||||
* @return offset to data
|
||||
*/
|
||||
protected final int getBMPOffset(char ch)
|
||||
{
|
||||
return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
|
||||
&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
|
||||
? getRawOffset(LEAD_INDEX_OFFSET_, ch)
|
||||
: getRawOffset(0, ch);
|
||||
// using a getRawOffset(ch) makes no diff
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the offset to the data which this lead surrogate character points
|
||||
* to.
|
||||
* Data at the returned offset may contain folding offset information for
|
||||
* the next trailing surrogate character.
|
||||
* @param ch lead surrogate character
|
||||
* @return offset to data
|
||||
*/
|
||||
protected final int getLeadOffset(char ch)
|
||||
{
|
||||
return getRawOffset(0, ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal trie getter from a code point.
|
||||
* Could be faster(?) but longer with
|
||||
* if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
|
||||
* Gets the offset to data which the codepoint points to
|
||||
* @param ch codepoint
|
||||
* @return offset to data
|
||||
*/
|
||||
protected final int getCodePointOffset(int ch)
|
||||
{
|
||||
// if ((ch >> 16) == 0) slower
|
||||
if (ch < 0) {
|
||||
return -1;
|
||||
} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
|
||||
// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
|
||||
return getRawOffset(0, (char)ch);
|
||||
} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
|
||||
// BMP codepoint
|
||||
return getBMPOffset((char)ch);
|
||||
} else if (ch <= UCharacter.MAX_VALUE) {
|
||||
// look at the construction of supplementary characters
|
||||
// trail forms the ends of it.
|
||||
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
|
||||
(char)(ch & SURROGATE_MASK_));
|
||||
} else {
|
||||
// return -1 // if there is an error, in this case we return
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Parses the inputstream and creates the trie index with it.</p>
|
||||
* <p>This is overwritten by the child classes.
|
||||
* @param inputStream input stream containing the trie information
|
||||
* @exception IOException thrown when data reading fails.
|
||||
*/
|
||||
protected void unserialize(InputStream inputStream) throws IOException
|
||||
{
|
||||
//indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
|
||||
m_index_ = new char[m_dataOffset_];
|
||||
DataInputStream input = new DataInputStream(inputStream);
|
||||
for (int i = 0; i < m_dataOffset_; i ++) {
|
||||
m_index_[i] = input.readChar();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if this is a 32 bit trie
|
||||
* @return true if options specifies this is a 32 bit trie
|
||||
*/
|
||||
protected final boolean isIntTrie()
|
||||
{
|
||||
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if this is a 16 bit trie
|
||||
* @return true if this is a 16 bit trie
|
||||
*/
|
||||
protected final boolean isCharTrie()
|
||||
{
|
||||
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
|
||||
}
|
||||
|
||||
// private data members --------------------------------------------
|
||||
|
||||
/**
|
||||
* Latin 1 option mask
|
||||
*/
|
||||
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
|
||||
/**
|
||||
* Constant number to authenticate the byte block
|
||||
*/
|
||||
protected static final int HEADER_SIGNATURE_ = 0x54726965;
|
||||
/**
|
||||
* Header option formatting
|
||||
*/
|
||||
private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
|
||||
protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
|
||||
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
|
||||
|
||||
/**
|
||||
* Flag indicator for Latin quick access data block
|
||||
*/
|
||||
private boolean m_isLatin1Linear_;
|
||||
|
||||
/**
|
||||
* <p>Trie options field.</p>
|
||||
* <p>options bit field:<br>
|
||||
* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
|
||||
* 8 0 = 16-bit data, 1=32-bit data<br>
|
||||
* 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
|
||||
* 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
|
||||
*/
|
||||
private int m_options_;
|
||||
|
||||
// private methods ---------------------------------------------------
|
||||
|
||||
/**
|
||||
* Authenticates raw data header.
|
||||
* Checking the header information, signature and options.
|
||||
* @param signature This contains the options and type of a Trie
|
||||
* @return true if the header is authenticated valid
|
||||
*/
|
||||
private final boolean checkHeader(int signature)
|
||||
{
|
||||
// check the signature
|
||||
// Trie in big-endian US-ASCII (0x54726965).
|
||||
// Magic number to authenticate the data.
|
||||
if (signature != HEADER_SIGNATURE_) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
|
||||
INDEX_STAGE_1_SHIFT_ ||
|
||||
((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
|
||||
HEADER_OPTIONS_SHIFT_MASK_)
|
||||
!= INDEX_STAGE_2_SHIFT_) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
548
jdkSrc/jdk8/sun/text/normalizer/TrieIterator.java
Normal file
548
jdkSrc/jdk8/sun/text/normalizer/TrieIterator.java
Normal file
@@ -0,0 +1,548 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
/**
|
||||
* <p>Class enabling iteration of the values in a Trie.</p>
|
||||
* <p>Result of each iteration contains the interval of codepoints that have
|
||||
* the same value type and the value type itself.</p>
|
||||
* <p>The comparison of each codepoint value is done via extract(), which the
|
||||
* default implementation is to return the value as it is.</p>
|
||||
* <p>Method extract() can be overwritten to perform manipulations on
|
||||
* codepoint values in order to perform specialized comparison.</p>
|
||||
* <p>TrieIterator is designed to be a generic iterator for the CharTrie
|
||||
* and the IntTrie, hence to accommodate both types of data, the return
|
||||
* result will be in terms of int (32 bit) values.</p>
|
||||
* <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
|
||||
* <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
|
||||
* Internally, icu4c's utrie_enum performs all iterations in its body. In Java
|
||||
* sense, the caller will have to pass an object with a callback function
|
||||
* UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
|
||||
* uint32_t value) into utrie_enum. utrie_enum will then find ranges of
|
||||
* codepoints with the same value as determined by
|
||||
* UTrieEnumValue(const void *context, uint32_t value). for each range,
|
||||
* utrie_enum calls the callback function to perform a task. In this way,
|
||||
* icu4c performs the iteration within utrie_enum.
|
||||
* To follow the JDK model, icu4j is slightly different from icu4c.
|
||||
* Instead of requesting the caller to implement an object for a callback.
|
||||
* The caller will have to implement a subclass of TrieIterator, fleshing out
|
||||
* the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
|
||||
* the caller will have to code his own iteration and flesh out the task
|
||||
* (equivalent to UTrieEnumRange) to be performed in the iteration loop.
|
||||
* </p>
|
||||
* <p>There are basically 3 usage scenarios for porting:</p>
|
||||
* <p>1) UTrieEnumValue is the only implemented callback then just implement a
|
||||
* subclass of TrieIterator and override the extract(int) method. The
|
||||
* extract(int) method is analogous to the UTrieEnumValue callback.
|
||||
* </p>
|
||||
* <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
|
||||
* a subclass of TrieIterator, override the extract method and iterate, e.g
|
||||
* </p>
|
||||
* <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
|
||||
* set);<br>
|
||||
* In Java :<br>
|
||||
* <pre>
|
||||
* class TrieIteratorImpl extends TrieIterator{
|
||||
* public TrieIteratorImpl(Trie data){
|
||||
* super(data);
|
||||
* }
|
||||
* public int extract(int value){
|
||||
* // port the implementation of _enumPropertyStartsValue here
|
||||
* }
|
||||
* }
|
||||
* ....
|
||||
* TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
|
||||
* while(fcdIter.next(result)) {
|
||||
* // port the implementation of _enumPropertyStartsRange
|
||||
* }
|
||||
* </pre>
|
||||
* </p>
|
||||
* <p>3) UTrieEnumRange is the only implemented callback then just implement
|
||||
* the while loop, when utrie_enum is called
|
||||
* <pre>
|
||||
* // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
|
||||
* TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
|
||||
* while(fcdIter.next(result)){
|
||||
* set.add(result.start);
|
||||
* }
|
||||
* </pre>
|
||||
* </p>
|
||||
* @author synwee
|
||||
* @see com.ibm.icu.impl.Trie
|
||||
* @see com.ibm.icu.lang.UCharacterTypeIterator
|
||||
* @since release 2.1, Jan 17 2002
|
||||
*/
|
||||
public class TrieIterator implements RangeValueIterator
|
||||
{
|
||||
|
||||
// public constructor ---------------------------------------------
|
||||
|
||||
/**
|
||||
* TrieEnumeration constructor
|
||||
* @param trie to be used
|
||||
* @exception IllegalArgumentException throw when argument is null.
|
||||
*/
|
||||
public TrieIterator(Trie trie)
|
||||
{
|
||||
if (trie == null) {
|
||||
throw new IllegalArgumentException(
|
||||
"Argument trie cannot be null");
|
||||
}
|
||||
m_trie_ = trie;
|
||||
// synwee: check that extract belongs to the child class
|
||||
m_initialValue_ = extract(m_trie_.getInitialValue());
|
||||
reset();
|
||||
}
|
||||
|
||||
// public methods -------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Returns true if we are not at the end of the iteration, false
|
||||
* otherwise.</p>
|
||||
* <p>The next set of codepoints with the same value type will be
|
||||
* calculated during this call and returned in the arguement element.</p>
|
||||
* @param element return result
|
||||
* @return true if we are not at the end of the iteration, false otherwise.
|
||||
* @exception NoSuchElementException - if no more elements exist.
|
||||
* @see com.ibm.icu.util.RangeValueIterator.Element
|
||||
*/
|
||||
public final boolean next(Element element)
|
||||
{
|
||||
if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
|
||||
return false;
|
||||
}
|
||||
if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
|
||||
calculateNextBMPElement(element)) {
|
||||
return true;
|
||||
}
|
||||
calculateNextSupplementaryElement(element);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the iterator to the beginning of the iteration
|
||||
*/
|
||||
public final void reset()
|
||||
{
|
||||
m_currentCodepoint_ = 0;
|
||||
m_nextCodepoint_ = 0;
|
||||
m_nextIndex_ = 0;
|
||||
m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
|
||||
if (m_nextBlock_ == 0) {
|
||||
m_nextValue_ = m_initialValue_;
|
||||
}
|
||||
else {
|
||||
m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
|
||||
}
|
||||
m_nextBlockIndex_ = 0;
|
||||
m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
|
||||
}
|
||||
|
||||
// protected methods ----------------------------------------------
|
||||
|
||||
/**
|
||||
* Called by next() to extracts a 32 bit value from a trie value
|
||||
* used for comparison.
|
||||
* This method is to be overwritten if special manipulation is to be done
|
||||
* to retrieve a relevant comparison.
|
||||
* The default function is to return the value as it is.
|
||||
* @param value a value from the trie
|
||||
* @return extracted value
|
||||
*/
|
||||
protected int extract(int value)
|
||||
{
|
||||
return value;
|
||||
}
|
||||
|
||||
// private methods ------------------------------------------------
|
||||
|
||||
/**
|
||||
* Set the result values
|
||||
* @param element return result object
|
||||
* @param start codepoint of range
|
||||
* @param limit (end + 1) codepoint of range
|
||||
* @param value common value of range
|
||||
*/
|
||||
private final void setResult(Element element, int start, int limit,
|
||||
int value)
|
||||
{
|
||||
element.start = start;
|
||||
element.limit = limit;
|
||||
element.value = value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finding the next element.
|
||||
* This method is called just before returning the result of
|
||||
* next().
|
||||
* We always store the next element before it is requested.
|
||||
* In the case that we have to continue calculations into the
|
||||
* supplementary planes, a false will be returned.
|
||||
* @param element return result object
|
||||
* @return true if the next range is found, false if we have to proceed to
|
||||
* the supplementary range.
|
||||
*/
|
||||
private final boolean calculateNextBMPElement(Element element)
|
||||
{
|
||||
int currentBlock = m_nextBlock_;
|
||||
int currentValue = m_nextValue_;
|
||||
m_currentCodepoint_ = m_nextCodepoint_;
|
||||
m_nextCodepoint_ ++;
|
||||
m_nextBlockIndex_ ++;
|
||||
if (!checkBlockDetail(currentValue)) {
|
||||
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
||||
currentValue);
|
||||
return true;
|
||||
}
|
||||
// synwee check that next block index == 0 here
|
||||
// enumerate BMP - the main loop enumerates data blocks
|
||||
while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
|
||||
m_nextIndex_ ++;
|
||||
// because of the way the character is split to form the index
|
||||
// the lead surrogate and trail surrogate can not be in the
|
||||
// mid of a block
|
||||
if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
|
||||
// skip lead surrogate code units,
|
||||
// go to lead surrogate codepoints
|
||||
m_nextIndex_ = BMP_INDEX_LENGTH_;
|
||||
}
|
||||
else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
|
||||
// go back to regular BMP code points
|
||||
m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
|
||||
}
|
||||
|
||||
m_nextBlockIndex_ = 0;
|
||||
if (!checkBlock(currentBlock, currentValue)) {
|
||||
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
||||
currentValue);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
m_nextCodepoint_ --; // step one back since this value has not been
|
||||
m_nextBlockIndex_ --; // retrieved yet.
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the next supplementary element.
|
||||
* For each entry in the trie, the value to be delivered is passed through
|
||||
* extract().
|
||||
* We always store the next element before it is requested.
|
||||
* Called after calculateNextBMP() completes its round of BMP characters.
|
||||
* There is a slight difference in the usage of m_currentCodepoint_
|
||||
* here as compared to calculateNextBMP(). Though both represents the
|
||||
* lower bound of the next element, in calculateNextBMP() it gets set
|
||||
* at the start of any loop, where-else, in calculateNextSupplementary()
|
||||
* since m_currentCodepoint_ already contains the lower bound of the
|
||||
* next element (passed down from calculateNextBMP()), we keep it till
|
||||
* the end before resetting it to the new value.
|
||||
* Note, if there are no more iterations, it will never get to here.
|
||||
* Blocked out by next().
|
||||
* @param element return result object
|
||||
*/
|
||||
private final void calculateNextSupplementaryElement(Element element)
|
||||
{
|
||||
// Keep the value and data block of the range being built; the m_next*_
// fields are advanced below while looking for the end of that range.
int currentValue = m_nextValue_;
|
||||
int currentBlock = m_nextBlock_;
|
||||
m_nextCodepoint_ ++;
|
||||
m_nextBlockIndex_ ++;
|
||||
|
||||
// Taken when the trail surrogate computed for m_nextCodepoint_ is not the
// minimum trail surrogate value.
if (UTF16.getTrailSurrogate(m_nextCodepoint_)
|
||||
!= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
|
||||
// this piece is only called when we are in the middle of a lead
|
||||
// surrogate block
|
||||
// A range is delivered here only when checkNullNextTrailIndex() returns
// false AND checkBlockDetail() then finds a value that differs from
// currentValue (checkBlockDetail() is not called otherwise).
if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
|
||||
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
||||
currentValue);
|
||||
// the end of the delivered range becomes the start of the next one
m_currentCodepoint_ = m_nextCodepoint_;
|
||||
return;
|
||||
}
|
||||
// we have cleared one block
|
||||
m_nextIndex_ ++;
|
||||
m_nextTrailIndexOffset_ ++;
|
||||
// continue with the remaining index blocks counted by
// m_nextTrailIndexOffset_
if (!checkTrailBlock(currentBlock, currentValue)) {
|
||||
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
||||
currentValue);
|
||||
m_currentCodepoint_ = m_nextCodepoint_;
|
||||
return;
|
||||
}
|
||||
}
|
||||
int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
|
||||
// enumerate supplementary code points
|
||||
while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
|
||||
// lead surrogate access
|
||||
int leadBlock =
|
||||
m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
|
||||
Trie.INDEX_STAGE_2_SHIFT_;
|
||||
if (leadBlock == m_trie_.m_dataOffset_) {
|
||||
// no entries for a whole block of lead surrogates
|
||||
if (currentValue != m_initialValue_) {
|
||||
// the range being built ends here; the next range starts with the
// initial value at the beginning of block 0
m_nextValue_ = m_initialValue_;
|
||||
m_nextBlock_ = 0;
|
||||
m_nextBlockIndex_ = 0;
|
||||
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
||||
currentValue);
|
||||
m_currentCodepoint_ = m_nextCodepoint_;
|
||||
return;
|
||||
}
|
||||
|
||||
nextLead += DATA_BLOCK_LENGTH_;
|
||||
// number of total affected supplementary codepoints in one
|
||||
// block
|
||||
// this is not a simple addition of
|
||||
// DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
|
||||
// that we might have moved some of the codepoints
|
||||
m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
|
||||
(char)nextLead,
|
||||
(char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
|
||||
// nextLead was already advanced above, so bypass the nextLead ++ at the
// end of the loop body
continue;
|
||||
}
|
||||
// fail with an explicit message instead of an anonymous
// NullPointerException from the dereference below
if (m_trie_.m_dataManipulate_ == null) {
|
||||
throw new NullPointerException(
|
||||
"The field DataManipulate in this Trie is null");
|
||||
}
|
||||
// enumerate trail surrogates for this lead surrogate
|
||||
m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
|
||||
m_trie_.getValue(leadBlock +
|
||||
(nextLead & Trie.INDEX_STAGE_3_MASK_)));
|
||||
if (m_nextIndex_ <= 0) {
|
||||
// no data for this lead surrogate
|
||||
if (currentValue != m_initialValue_) {
|
||||
m_nextValue_ = m_initialValue_;
|
||||
m_nextBlock_ = 0;
|
||||
m_nextBlockIndex_ = 0;
|
||||
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
||||
currentValue);
|
||||
m_currentCodepoint_ = m_nextCodepoint_;
|
||||
return;
|
||||
}
|
||||
// still inside a run of the initial value: advance by
// TRAIL_SURROGATE_COUNT_ code points in one step
m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
|
||||
} else {
|
||||
// m_nextIndex_ now holds a positive folding offset; scan its index
// blocks starting from the first one
m_nextTrailIndexOffset_ = 0;
|
||||
if (!checkTrailBlock(currentBlock, currentValue)) {
|
||||
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
|
||||
currentValue);
|
||||
m_currentCodepoint_ = m_nextCodepoint_;
|
||||
return;
|
||||
}
|
||||
}
|
||||
nextLead ++;
|
||||
}
|
||||
|
||||
// deliver last range
|
||||
// the upper bound passed here is UCharacter.MAX_VALUE + 1
setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
|
||||
currentValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal block value calculations
|
||||
* Performs calculations on a data block to find codepoints in m_nextBlock_
|
||||
* after the index m_nextBlockIndex_ that has the same value.
|
||||
* Note m_*_ variables at this point is the next codepoint whose value
|
||||
* has not been calculated.
|
||||
* But when returned with false, it will be the last codepoint whose
|
||||
* value has been calculated.
|
||||
* @param currentValue the value which other codepoints are tested against
|
||||
* @return true if the whole block has the same value as currentValue or if
|
||||
* the whole block has been calculated, false otherwise.
|
||||
*/
|
||||
private final boolean checkBlockDetail(int currentValue)
|
||||
{
|
||||
while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
|
||||
m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
|
||||
m_nextBlockIndex_));
|
||||
if (m_nextValue_ != currentValue) {
|
||||
return false;
|
||||
}
|
||||
++ m_nextBlockIndex_;
|
||||
++ m_nextCodepoint_;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal block value calculations
|
||||
* Performs calculations on a data block to find codepoints in m_nextBlock_
|
||||
* that has the same value.
|
||||
* Will call checkBlockDetail() if highlevel check fails.
|
||||
* Note m_*_ variables at this point is the next codepoint whose value
|
||||
* has not been calculated.
|
||||
* @param currentBlock the initial block containing all currentValue
|
||||
* @param currentValue the value which other codepoints are tested against
|
||||
* @return true if the whole block has the same value as currentValue or if
|
||||
* the whole block has been calculated, false otherwise.
|
||||
*/
|
||||
private final boolean checkBlock(int currentBlock, int currentValue)
|
||||
{
|
||||
m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
|
||||
Trie.INDEX_STAGE_2_SHIFT_;
|
||||
if (m_nextBlock_ == currentBlock &&
|
||||
(m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
|
||||
// the block is the same as the previous one, filled with
|
||||
// currentValue
|
||||
m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
|
||||
}
|
||||
else if (m_nextBlock_ == 0) {
|
||||
// this is the all-initial-value block
|
||||
if (currentValue != m_initialValue_) {
|
||||
m_nextValue_ = m_initialValue_;
|
||||
m_nextBlockIndex_ = 0;
|
||||
return false;
|
||||
}
|
||||
m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
|
||||
}
|
||||
else {
|
||||
if (!checkBlockDetail(currentValue)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal block value calculations
|
||||
* Performs calculations on multiple data blocks for a set of trail
|
||||
* surrogates to find codepoints in m_nextBlock_ that has the same value.
|
||||
* Will call checkBlock() for internal block checks.
|
||||
* Note m_*_ variables at this point is the next codepoint whose value
|
||||
* has not been calculated.
|
||||
* @param currentBlock the initial block containing all currentValue
|
||||
* @param currentValue the value which other codepoints are tested against
|
||||
* @return true if the whole block has the same value as currentValue or if
|
||||
* the whole block has been calculated, false otherwise.
|
||||
*/
|
||||
private final boolean checkTrailBlock(int currentBlock,
|
||||
int currentValue)
|
||||
{
|
||||
// enumerate code points for this lead surrogate
|
||||
while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
|
||||
{
|
||||
// if we ever reach here, we are at the start of a new block
|
||||
m_nextBlockIndex_ = 0;
|
||||
// copy of most of the body of the BMP loop
|
||||
if (!checkBlock(currentBlock, currentValue)) {
|
||||
return false;
|
||||
}
|
||||
m_nextTrailIndexOffset_ ++;
|
||||
m_nextIndex_ ++;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if we are beginning at the start of a initial block.
|
||||
* If we are then the rest of the codepoints in this initial block
|
||||
* has the same values.
|
||||
* We increment m_nextCodepoint_ and relevant data members if so.
|
||||
* This is used only in for the supplementary codepoints because
|
||||
* the offset to the trail indexes could be 0.
|
||||
* @return true if we are at the start of a initial block.
|
||||
*/
|
||||
private final boolean checkNullNextTrailIndex()
|
||||
{
|
||||
if (m_nextIndex_ <= 0) {
|
||||
m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
|
||||
int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
|
||||
int leadBlock =
|
||||
m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
|
||||
Trie.INDEX_STAGE_2_SHIFT_;
|
||||
if (m_trie_.m_dataManipulate_ == null) {
|
||||
throw new NullPointerException(
|
||||
"The field DataManipulate in this Trie is null");
|
||||
}
|
||||
m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
|
||||
m_trie_.getValue(leadBlock +
|
||||
(nextLead & Trie.INDEX_STAGE_3_MASK_)));
|
||||
m_nextIndex_ --;
|
||||
m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// private data members --------------------------------------------
|
||||
|
||||
/**
|
||||
* Size of the stage 1 BMP indexes
|
||||
*/
|
||||
private static final int BMP_INDEX_LENGTH_ =
|
||||
0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
|
||||
/**
|
||||
* Lead surrogate minimum value
|
||||
*/
|
||||
private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
|
||||
/**
|
||||
* Trail surrogate minimum value
|
||||
*/
|
||||
private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
|
||||
/**
|
||||
* Number of trail surrogate
|
||||
*/
|
||||
private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
|
||||
/**
|
||||
* Number of stage 1 indexes for supplementary calculations that maps to
|
||||
* each lead surrogate character.
|
||||
* See second pass into getRawOffset for the trail surrogate character.
|
||||
* 10 for significant number of bits for trail surrogates, 5 for what we
|
||||
* discard during shifting.
|
||||
*/
|
||||
private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
|
||||
1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
|
||||
/**
|
||||
* Number of data values in a stage 2 (data array) block.
|
||||
*/
|
||||
private static final int DATA_BLOCK_LENGTH_ =
|
||||
1 << Trie.INDEX_STAGE_1_SHIFT_;
|
||||
/**
|
||||
* Trie instance
|
||||
*/
|
||||
private Trie m_trie_;
|
||||
/**
|
||||
* Initial value for trie values
|
||||
*/
|
||||
private int m_initialValue_;
|
||||
/**
|
||||
* Next element results and data.
|
||||
*/
|
||||
// Lower bound of the element being built; passed to setResult() together
// with m_nextCodepoint_ and then reset to m_nextCodepoint_.
private int m_currentCodepoint_;
|
||||
// Next code point whose value has not been calculated yet; advanced one
// entry at a time or by whole blocks.
private int m_nextCodepoint_;
|
||||
// Most recently extracted trie value, or m_initialValue_ when an
// all-initial-value block ended the current range.
private int m_nextValue_;
|
||||
// Position in m_trie_.m_index_ of the data block being examined.
private int m_nextIndex_;
|
||||
// Data offset of the current block:
// m_trie_.m_index_[m_nextIndex_] << Trie.INDEX_STAGE_2_SHIFT_; 0 denotes
// the all-initial-value block.
private int m_nextBlock_;
|
||||
// Position inside m_nextBlock_, compared against DATA_BLOCK_LENGTH_.
private int m_nextBlockIndex_;
|
||||
// Number of index blocks already handled for the current lead surrogate,
// compared against TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_.
private int m_nextTrailIndexOffset_;
|
||||
}
|
||||
179
jdkSrc/jdk8/sun/text/normalizer/UBiDiProps.java
Normal file
179
jdkSrc/jdk8/sun/text/normalizer/UBiDiProps.java
Normal file
@@ -0,0 +1,179 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
* file name: UBiDiProps.java
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2005jan16
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Low-level Unicode bidi/shaping properties access.
|
||||
* Java port of ubidi_props.h/.c.
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
public final class UBiDiProps {
|
||||
// constructors etc. --------------------------------------------------- ***
|
||||
|
||||
// port of ubidi_openProps()
|
||||
public UBiDiProps() throws IOException{
|
||||
InputStream is=ICUData.getStream(DATA_FILE_NAME);
|
||||
BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */);
|
||||
readData(b);
|
||||
b.close();
|
||||
is.close();
|
||||
|
||||
}
|
||||
|
||||
private void readData(InputStream is) throws IOException {
|
||||
DataInputStream inputStream=new DataInputStream(is);
|
||||
|
||||
// read the header
|
||||
ICUBinary.readHeader(inputStream, FMT, new IsAcceptable());
|
||||
|
||||
// read indexes[]
|
||||
int i, count;
|
||||
count=inputStream.readInt();
|
||||
if(count<IX_INDEX_TOP) {
|
||||
throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
|
||||
}
|
||||
indexes=new int[count];
|
||||
|
||||
indexes[0]=count;
|
||||
for(i=1; i<count; ++i) {
|
||||
indexes[i]=inputStream.readInt();
|
||||
}
|
||||
|
||||
// read the trie
|
||||
trie=new CharTrie(inputStream, null);
|
||||
|
||||
// read mirrors[]
|
||||
count=indexes[IX_MIRROR_LENGTH];
|
||||
if(count>0) {
|
||||
mirrors=new int[count];
|
||||
for(i=0; i<count; ++i) {
|
||||
mirrors[i]=inputStream.readInt();
|
||||
}
|
||||
}
|
||||
|
||||
// read jgArray[]
|
||||
count=indexes[IX_JG_LIMIT]-indexes[IX_JG_START];
|
||||
jgArray=new byte[count];
|
||||
for(i=0; i<count; ++i) {
|
||||
jgArray[i]=inputStream.readByte();
|
||||
}
|
||||
}
|
||||
|
||||
// implement ICUBinary.Authenticate
|
||||
private final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
public boolean isDataVersionAcceptable(byte version[]) {
|
||||
return version[0]==1 &&
|
||||
version[2]==Trie.INDEX_STAGE_1_SHIFT_ && version[3]==Trie.INDEX_STAGE_2_SHIFT_;
|
||||
}
|
||||
}
|
||||
|
||||
// UBiDiProps singleton
|
||||
private static UBiDiProps gBdp=null;
|
||||
|
||||
// port of ubidi_getSingleton()
|
||||
public static final synchronized UBiDiProps getSingleton() throws IOException {
|
||||
if(gBdp==null) {
|
||||
gBdp=new UBiDiProps();
|
||||
}
|
||||
return gBdp;
|
||||
}
|
||||
|
||||
// UBiDiProps dummy singleton
|
||||
private static UBiDiProps gBdpDummy=null;
|
||||
|
||||
private UBiDiProps(boolean makeDummy) { // ignore makeDummy, only creates a unique signature
|
||||
indexes=new int[IX_TOP];
|
||||
indexes[0]=IX_TOP;
|
||||
trie=new CharTrie(0, 0, null); // dummy trie, always returns 0
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a singleton dummy object, one that works with no real data.
|
||||
* This can be used when the real data is not available.
|
||||
* Using the dummy can reduce checks for available data after an initial failure.
|
||||
* Port of ucase_getDummy().
|
||||
*/
|
||||
public static final synchronized UBiDiProps getDummy() {
|
||||
if(gBdpDummy==null) {
|
||||
gBdpDummy=new UBiDiProps(true);
|
||||
}
|
||||
return gBdpDummy;
|
||||
}
|
||||
|
||||
public final int getClass(int c) {
|
||||
return getClassFromProps(trie.getCodePointValue(c));
|
||||
}
|
||||
|
||||
// data members -------------------------------------------------------- ***
|
||||
private int indexes[];
|
||||
private int mirrors[];
|
||||
private byte jgArray[];
|
||||
|
||||
private CharTrie trie;
|
||||
|
||||
// data format constants ----------------------------------------------- ***
|
||||
private static final String DATA_FILE_NAME = "/sun/text/resources/ubidi.icu";
|
||||
|
||||
/* format "BiDi" */
|
||||
private static final byte FMT[]={ 0x42, 0x69, 0x44, 0x69 };
|
||||
|
||||
/* indexes into indexes[] */
|
||||
private static final int IX_INDEX_TOP=0;
|
||||
private static final int IX_MIRROR_LENGTH=3;
|
||||
|
||||
private static final int IX_JG_START=4;
|
||||
private static final int IX_JG_LIMIT=5;
|
||||
|
||||
private static final int IX_TOP=16;
|
||||
|
||||
private static final int CLASS_MASK= 0x0000001f;
|
||||
|
||||
private static final int getClassFromProps(int props) {
|
||||
return props&CLASS_MASK;
|
||||
}
|
||||
|
||||
}
|
||||
431
jdkSrc/jdk8/sun/text/normalizer/UCharacter.java
Normal file
431
jdkSrc/jdk8/sun/text/normalizer/UCharacter.java
Normal file
@@ -0,0 +1,431 @@
|
||||
/*
|
||||
* Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.MissingResourceException;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* The UCharacter class provides extensions to the
|
||||
* <a href="https://docs.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html">
|
||||
* java.lang.Character</a> class. These extensions provide support for
|
||||
* more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
|
||||
* class, provide support for supplementary characters (those with code
|
||||
* points above U+FFFF).
|
||||
* Each ICU release supports the latest version of Unicode available at that time.
|
||||
* </p>
|
||||
* <p>
|
||||
* Code points are represented in these API using ints. While it would be
|
||||
* more convenient in Java to have a separate primitive datatype for them,
|
||||
* ints suffice in the meantime.
|
||||
* </p>
|
||||
* <p>
|
||||
* To use this class please add the jar file name icu4j.jar to the
|
||||
* class path, since it contains data files which supply the information used
|
||||
* by this file.<br>
|
||||
* E.g. In Windows <br>
|
||||
* <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
|
||||
* Otherwise, another method would be to copy the files uprops.dat and
|
||||
* unames.icu from the icu4j source subdirectory
|
||||
* <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
|
||||
* <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
|
||||
* </p>
|
||||
* <p>
|
||||
* Aside from the additions for UTF-16 support, and the updated Unicode
|
||||
* properties, the main differences between UCharacter and Character are:
|
||||
* <ul>
|
||||
* <li> UCharacter is not designed to be a char wrapper and does not have
|
||||
* APIs which involve management of that single char.<br>
|
||||
* These include:
|
||||
* <ul>
|
||||
* <li> char charValue(),
|
||||
* <li> int compareTo(java.lang.Character, java.lang.Character), etc.
|
||||
* </ul>
|
||||
* <li> UCharacter does not include Character APIs that are deprecated, nor
|
||||
* does it include the Java-specific character information, such as
|
||||
* boolean isJavaIdentifierPart(char ch).
|
||||
* <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
|
||||
* values '10' - '35'. UCharacter also does this in digit and
|
||||
* getNumericValue, to adhere to the java semantics of these
|
||||
* methods. New methods unicodeDigit, and
|
||||
* getUnicodeNumericValue do not treat the above code points
|
||||
* as having numeric values. This is a semantic change from ICU4J 1.3.1.
|
||||
* </ul>
|
||||
* <p>
|
||||
* Further detail differences can be determined from the program
|
||||
* <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
|
||||
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
|
||||
* </p>
|
||||
* <p>
|
||||
* In addition to Java compatibility functions, which calculate derived properties,
|
||||
* this API provides low-level access to the Unicode Character Database.
|
||||
* </p>
|
||||
* <p>
|
||||
* Unicode assigns each code point (not just assigned character) values for
|
||||
* many properties.
|
||||
* Most of them are simple boolean flags, or constants from a small enumerated list.
|
||||
* For some properties, values are strings or other relatively more complex types.
|
||||
* </p>
|
||||
* <p>
|
||||
* For more information see
|
||||
* "About the Unicode Character Database" (http://www.unicode.org/ucd/)
|
||||
* and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).
|
||||
* </p>
|
||||
* <p>
|
||||
* There are also functions that provide easy migration from C/POSIX functions
|
||||
* like isblank(). Their use is generally discouraged because the C/POSIX
|
||||
* standards do not define their semantics beyond the ASCII range, which means
|
||||
* that different implementations exhibit very different behavior.
|
||||
* Instead, Unicode properties should be used directly.
|
||||
* </p>
|
||||
* <p>
|
||||
* There are also only a few, broad C/POSIX character classes, and they tend
|
||||
* to be used for conflicting purposes. For example, the "isalpha()" class
|
||||
* is sometimes used to determine word boundaries, while a more sophisticated
|
||||
* approach would at least distinguish initial letters from continuation
|
||||
* characters (the latter including combining marks).
|
||||
* (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
|
||||
* Another example: There is no "istitle()" class for titlecase characters.
|
||||
* </p>
|
||||
* <p>
|
||||
* ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
|
||||
* ICU implements them according to the Standard Recommendations in
|
||||
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
|
||||
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
|
||||
* </p>
|
||||
* <p>
|
||||
* API access for C/POSIX character classes is as follows:
|
||||
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
|
||||
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
|
||||
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
|
||||
* - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
|
||||
* - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
|
||||
* - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
|
||||
* - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
|
||||
* - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
|
||||
* - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
|
||||
* - cntrl: getType(c)==CONTROL
|
||||
* - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
|
||||
* - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
|
||||
* </p>
|
||||
* <p>
|
||||
* The C/POSIX character classes are also available in UnicodeSet patterns,
|
||||
* using patterns like [:graph:] or \p{graph}.
|
||||
* </p>
|
||||
* <p>
|
||||
* Note: There are several ICU (and Java) whitespace functions.
|
||||
* Comparison:
|
||||
* - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
|
||||
* most of general categories "Z" (separators) + most whitespace ISO controls
|
||||
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
|
||||
* - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
|
||||
* - isSpaceChar: just Z (including no-break spaces)
|
||||
* </p>
|
||||
* <p>
|
||||
* This class is not subclassable
|
||||
* </p>
|
||||
* @author Syn Wee Quek
|
||||
* @stable ICU 2.1
|
||||
* @see com.ibm.icu.lang.UCharacterEnums
|
||||
*/
|
||||
|
||||
public final class UCharacter
|
||||
{
|
||||
|
||||
/**
|
||||
* Numeric Type constants.
|
||||
* @see UProperty#NUMERIC_TYPE
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static interface NumericType
|
||||
{
|
||||
/**
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int DECIMAL = 1;
|
||||
}
|
||||
|
||||
// public data members -----------------------------------------------
|
||||
|
||||
/**
|
||||
* The lowest Unicode code point value.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
|
||||
|
||||
/**
|
||||
* The highest Unicode code point value (scalar value) according to the
|
||||
* Unicode Standard.
|
||||
* This is a 21-bit value (21 bits, rounded up).<br>
|
||||
* Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
|
||||
|
||||
/**
|
||||
* The minimum value for Supplementary code points
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static final int SUPPLEMENTARY_MIN_VALUE =
|
||||
UTF16.SUPPLEMENTARY_MIN_VALUE;
|
||||
|
||||
// public methods ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Retrieves the numeric value of a decimal digit code point.
|
||||
* <br>This method observes the semantics of
|
||||
* <code>java.lang.Character.digit()</code>. Note that this
|
||||
* will return positive values for code points for which isDigit
|
||||
* returns false, just like java.lang.Character.
|
||||
* <br><em>Semantic Change:</em> In release 1.3.1 and
|
||||
* prior, this did not treat the European letters as having a
|
||||
* digit value, and also treated numeric letters and other numbers as
|
||||
* digits.
|
||||
* This has been changed to conform to the java semantics.
|
||||
* <br>A code point is a valid digit if and only if:
|
||||
* <ul>
|
||||
* <li>ch is a decimal digit or one of the european letters, and
|
||||
* <li>the value of ch is less than the specified radix.
|
||||
* </ul>
|
||||
* @param ch the code point to query
|
||||
* @param radix the radix
|
||||
* @return the numeric value represented by the code point in the
|
||||
* specified radix, or -1 if the code point is not a decimal digit
|
||||
* or if its value is too large for the radix
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int digit(int ch, int radix)
|
||||
{
|
||||
// when ch is out of bounds getProperty == 0
|
||||
int props = getProperty(ch);
|
||||
int value;
|
||||
if (getNumericType(props) == NumericType.DECIMAL) {
|
||||
value = UCharacterProperty.getUnsignedValue(props);
|
||||
} else {
|
||||
value = getEuropeanDigit(ch);
|
||||
}
|
||||
return (0 <= value && value < radix) ? value : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the Bidirection property of a code point.
|
||||
* For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
|
||||
* property.<br>
|
||||
* Result returned belongs to the interface
|
||||
* <a href=UCharacterDirection.html>UCharacterDirection</a>
|
||||
* @param ch the code point to be determined its direction
|
||||
* @return direction constant from UCharacterDirection.
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int getDirection(int ch)
|
||||
{
|
||||
return gBdp.getClass(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a code point corresponding to the two UTF16 characters.
|
||||
* @param lead the lead char
|
||||
* @param trail the trail char
|
||||
* @return code point if surrogate characters are valid.
|
||||
* @exception IllegalArgumentException thrown when argument characters do
|
||||
* not form a valid codepoint
|
||||
* @stable ICU 2.1
|
||||
*/
|
||||
public static int getCodePoint(char lead, char trail)
|
||||
{
|
||||
if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
|
||||
return UCharacterProperty.getRawSupplementary(lead, trail);
|
||||
}
|
||||
throw new IllegalArgumentException("Illegal surrogate characters");
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Get the "age" of the code point.</p>
|
||||
* <p>The "age" is the Unicode version when the code point was first
|
||||
* designated (as a non-character or for Private Use) or assigned a
|
||||
* character.
|
||||
* <p>This can be useful to avoid emitting code points to receiving
|
||||
* processes that do not accept newer characters.</p>
|
||||
* <p>The data is from the UCD file DerivedAge.txt.</p>
|
||||
* @param ch The code point.
|
||||
* @return the Unicode version number
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static VersionInfo getAge(int ch)
|
||||
{
|
||||
if (ch < MIN_VALUE || ch > MAX_VALUE) {
|
||||
throw new IllegalArgumentException("Codepoint out of bounds");
|
||||
}
|
||||
return PROPERTY_.getAge(ch);
|
||||
}
|
||||
|
||||
// private variables -------------------------------------------------
|
||||
|
||||
/**
|
||||
* Database storing the sets of character property
|
||||
*/
|
||||
private static final UCharacterProperty PROPERTY_;
|
||||
/**
|
||||
* For optimization
|
||||
*/
|
||||
private static final char[] PROPERTY_TRIE_INDEX_;
|
||||
private static final char[] PROPERTY_TRIE_DATA_;
|
||||
private static final int PROPERTY_INITIAL_VALUE_;
|
||||
|
||||
private static final UBiDiProps gBdp;
|
||||
|
||||
// block to initialise character property database
|
||||
static
{
    // The character property database is mandatory: any failure to load
    // it aborts class initialization.
    try
    {
        PROPERTY_ = UCharacterProperty.getInstance();
        PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
        PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
        PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
    }
    catch (Exception e)
    {
        // MissingResourceException has no public constructor taking a
        // cause, so attach the original exception with initCause() to keep
        // its type and stack trace available for diagnosis.
        MissingResourceException failure =
            new MissingResourceException(e.getMessage(),"","");
        failure.initCause(e);
        throw failure;
    }

    // The bidi data is optional: if it cannot be read, fall back to the
    // dummy properties object rather than failing initialization.
    UBiDiProps bdp;
    try {
        bdp=UBiDiProps.getSingleton();
    } catch(IOException e) {
        bdp=UBiDiProps.getDummy();
    }
    gBdp=bdp;
}
|
||||
|
||||
/**
|
||||
* Shift to get numeric type
|
||||
*/
|
||||
private static final int NUMERIC_TYPE_SHIFT_ = 5;
|
||||
/**
|
||||
* Mask to get numeric type
|
||||
*/
|
||||
private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
|
||||
|
||||
// private methods ---------------------------------------------------
|
||||
|
||||
/**
|
||||
* Getting the digit values of characters like 'A' - 'Z', normal,
|
||||
* half-width and full-width. This method assumes that the other digit
|
||||
* characters are checked by the calling method.
|
||||
* @param ch character to test
|
||||
* @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
|
||||
* its corresponding digit will be returned.
|
||||
*/
|
||||
private static int getEuropeanDigit(int ch) {
|
||||
if ((ch > 0x7a && ch < 0xff21)
|
||||
|| ch < 0x41 || (ch > 0x5a && ch < 0x61)
|
||||
|| ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
|
||||
return -1;
|
||||
}
|
||||
if (ch <= 0x7a) {
|
||||
// ch >= 0x41 or ch < 0x61
|
||||
return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
|
||||
}
|
||||
// ch >= 0xff21
|
||||
if (ch <= 0xff3a) {
|
||||
return ch + 10 - 0xff21;
|
||||
}
|
||||
// ch >= 0xff41 && ch <= 0xff5a
|
||||
return ch + 10 - 0xff41;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the numeric type of the property argument
|
||||
* @param props 32 bit property
|
||||
* @return the numeric type
|
||||
*/
|
||||
private static int getNumericType(int props)
|
||||
{
|
||||
return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the property value at the index.
|
||||
* This is optimized.
|
||||
* Note this is alittle different from CharTrie the index m_trieData_
|
||||
* is never negative.
|
||||
* This is a duplicate of UCharacterProperty.getProperty. For optimization
|
||||
* purposes, this method calls the trie data directly instead of through
|
||||
* UCharacterProperty.getProperty.
|
||||
* @param ch code point whose property value is to be retrieved
|
||||
* @return property value of code point
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
private static final int getProperty(int ch)
|
||||
{
|
||||
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
|
||||
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
|
||||
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
|
||||
// BMP codepoint 0000..D7FF or DC00..FFFF
|
||||
try { // using try for ch < 0 is faster than using an if statement
|
||||
return PROPERTY_TRIE_DATA_[
|
||||
(PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
|
||||
+ (ch & 0x1f)];
|
||||
} catch (ArrayIndexOutOfBoundsException e) {
|
||||
return PROPERTY_INITIAL_VALUE_;
|
||||
}
|
||||
}
|
||||
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
// lead surrogate D800..DBFF
|
||||
return PROPERTY_TRIE_DATA_[
|
||||
(PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
|
||||
+ (ch & 0x1f)];
|
||||
}
|
||||
// for optimization
|
||||
if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
|
||||
// supplementary code point 10000..10FFFF
|
||||
// look at the construction of supplementary characters
|
||||
// trail forms the ends of it.
|
||||
return PROPERTY_.m_trie_.getSurrogateValue(
|
||||
UTF16.getLeadSurrogate(ch),
|
||||
(char)(ch & 0x3ff));
|
||||
}
|
||||
// return m_dataOffset_ if there is an error, in this case we return
|
||||
// the default value: m_initialValue_
|
||||
// we cannot assume that m_initialValue_ is at offset 0
|
||||
// this is for optimization.
|
||||
return PROPERTY_INITIAL_VALUE_;
|
||||
}
|
||||
|
||||
}
|
||||
292
jdkSrc/jdk8/sun/text/normalizer/UCharacterIterator.java
Normal file
292
jdkSrc/jdk8/sun/text/normalizer/UCharacterIterator.java
Normal file
@@ -0,0 +1,292 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2006, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
/**
|
||||
* Abstract class that defines an API for iteration on text objects.This is an
|
||||
* interface for forward and backward iteration and random access into a text
|
||||
* object. Forward iteration is done with post-increment and backward iteration
|
||||
* is done with pre-decrement semantics, while the
|
||||
* <code>java.text.CharacterIterator</code> interface methods provided forward
|
||||
* iteration with "pre-increment" and backward iteration with pre-decrement
|
||||
* semantics. This API is more efficient for forward iteration over code points.
|
||||
* The other major difference is that this API can do both code unit and code point
|
||||
* iteration, <code>java.text.CharacterIterator</code> can only iterate over
|
||||
* code units and is limited to BMP (0 - 0xFFFF)
|
||||
* @author Ram
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract class UCharacterIterator
|
||||
implements Cloneable {
|
||||
|
||||
/**
|
||||
* Protected default constructor for the subclasses
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
protected UCharacterIterator(){
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicator that we have reached the ends of the UTF16 text.
|
||||
* Moved from UForwardCharacterIterator.java
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final int DONE = -1;
|
||||
|
||||
// static final methods ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a
|
||||
* source string.
|
||||
* @param source a string
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(String source){
|
||||
return new ReplaceableUCharacterIterator(source);
|
||||
}
|
||||
|
||||
//// for StringPrep
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a
|
||||
* source StringBuffer.
|
||||
* @param source an string buffer of UTF-16 code units
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(StringBuffer source){
|
||||
return new ReplaceableUCharacterIterator(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a
|
||||
* CharacterIterator.
|
||||
* @param source a valid CharacterIterator object.
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(CharacterIterator source){
|
||||
return new CharacterIteratorWrapper(source);
|
||||
}
|
||||
|
||||
// public methods ----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the code unit at the current index. If index is out
|
||||
* of range, returns DONE. Index is not changed.
|
||||
* @return current code unit
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int current();
|
||||
|
||||
/**
|
||||
* Returns the length of the text
|
||||
* @return length of the text
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int getLength();
|
||||
|
||||
|
||||
/**
|
||||
* Gets the current index in text.
|
||||
* @return current index in text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int getIndex();
|
||||
|
||||
|
||||
/**
|
||||
* Returns the UTF16 code unit at index, and increments to the next
|
||||
* code unit (post-increment semantics). If index is out of
|
||||
* range, DONE is returned, and the iterator is reset to the limit
|
||||
* of the text.
|
||||
* @return the next UTF16 code unit, or DONE if the index is at the limit
|
||||
* of the text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int next();
|
||||
|
||||
/**
|
||||
* Returns the code point at index, and increments to the next code
|
||||
* point (post-increment semantics). If index does not point to a
|
||||
* valid surrogate pair, the behavior is the same as
|
||||
* <code>next()</code>. Otherwise the iterator is incremented past
|
||||
* the surrogate pair, and the code point represented by the pair
|
||||
* is returned.
|
||||
* @return the next codepoint in text, or DONE if the index is at
|
||||
* the limit of the text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public int nextCodePoint(){
|
||||
int ch1 = next();
|
||||
if(UTF16.isLeadSurrogate((char)ch1)){
|
||||
int ch2 = next();
|
||||
if(UTF16.isTrailSurrogate((char)ch2)){
|
||||
return UCharacterProperty.getRawSupplementary((char)ch1,
|
||||
(char)ch2);
|
||||
}else if (ch2 != DONE) {
|
||||
// unmatched surrogate so back out
|
||||
previous();
|
||||
}
|
||||
}
|
||||
return ch1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decrement to the position of the previous code unit in the
|
||||
* text, and return it (pre-decrement semantics). If the
|
||||
* resulting index is less than 0, the index is reset to 0 and
|
||||
* DONE is returned.
|
||||
* @return the previous code unit in the text, or DONE if the new
|
||||
* index is before the start of the text.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int previous();
|
||||
|
||||
/**
|
||||
* Sets the index to the specified index in the text.
|
||||
* @param index the index within the text.
|
||||
* @exception IndexOutOfBoundsException is thrown if an invalid index is
|
||||
* supplied
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract void setIndex(int index);
|
||||
|
||||
//// for StringPrep
|
||||
/**
|
||||
* Fills the buffer with the underlying text storage of the iterator
|
||||
* If the buffer capacity is not enough a exception is thrown. The capacity
|
||||
* of the fill in buffer should at least be equal to length of text in the
|
||||
* iterator obtained by calling <code>getLength()</code>.
|
||||
* <b>Usage:</b>
|
||||
*
|
||||
* <code>
|
||||
* <pre>
|
||||
* UChacterIterator iter = new UCharacterIterator.getInstance(text);
|
||||
* char[] buf = new char[iter.getLength()];
|
||||
* iter.getText(buf);
|
||||
*
|
||||
* OR
|
||||
* char[] buf= new char[1];
|
||||
* int len = 0;
|
||||
* for(;;){
|
||||
* try{
|
||||
* len = iter.getText(buf);
|
||||
* break;
|
||||
* }catch(IndexOutOfBoundsException e){
|
||||
* buf = new char[iter.getLength()];
|
||||
* }
|
||||
* }
|
||||
* </pre>
|
||||
* </code>
|
||||
*
|
||||
* @param fillIn an array of chars to fill with the underlying UTF-16 code
|
||||
* units.
|
||||
* @param offset the position within the array to start putting the data.
|
||||
* @return the number of code units added to fillIn, as a convenience
|
||||
* @exception IndexOutOfBounds exception if there is not enough
|
||||
* room after offset in the array, or if offset < 0.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public abstract int getText(char[] fillIn, int offset);
|
||||
|
||||
//// for StringPrep
|
||||
/**
|
||||
* Convenience override for <code>getText(char[], int)</code> that provides
|
||||
* an offset of 0.
|
||||
* @param fillIn an array of chars to fill with the underlying UTF-16 code
|
||||
* units.
|
||||
* @return the number of code units added to fillIn, as a convenience
|
||||
* @exception IndexOutOfBounds exception if there is not enough
|
||||
* room in the array.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public final int getText(char[] fillIn) {
|
||||
return getText(fillIn, 0);
|
||||
}
|
||||
|
||||
//// for StringPrep
|
||||
/**
|
||||
* Convenience method for returning the underlying text storage as as string
|
||||
* @return the underlying text storage in the iterator as a string
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public String getText() {
|
||||
char[] text = new char[getLength()];
|
||||
getText(text);
|
||||
return new String(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves the current position by the number of code units
|
||||
* specified, either forward or backward depending on the sign
|
||||
* of delta (positive or negative respectively). If the resulting
|
||||
* index would be less than zero, the index is set to zero, and if
|
||||
* the resulting index would be greater than limit, the index is
|
||||
* set to limit.
|
||||
*
|
||||
* @param delta the number of code units to move the current
|
||||
* index.
|
||||
* @return the new index.
|
||||
* @exception IndexOutOfBoundsException is thrown if an invalid index is
|
||||
* supplied
|
||||
* @stable ICU 2.4
|
||||
*
|
||||
*/
|
||||
public int moveIndex(int delta) {
|
||||
int x = Math.max(0, Math.min(getIndex() + delta, getLength()));
|
||||
setIndex(x);
|
||||
return x;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a copy of this iterator, independent from other iterators.
|
||||
* If it is not possible to clone the iterator, returns null.
|
||||
* @return copy of this iterator
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
public Object clone() throws CloneNotSupportedException{
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
}
|
||||
369
jdkSrc/jdk8/sun/text/normalizer/UCharacterProperty.java
Normal file
369
jdkSrc/jdk8/sun/text/normalizer/UCharacterProperty.java
Normal file
@@ -0,0 +1,369 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.MissingResourceException;
|
||||
|
||||
/**
|
||||
* <p>Internal class used for Unicode character property database.</p>
|
||||
* <p>This classes store binary data read from uprops.icu.
|
||||
* It does not have the capability to parse the data into more high-level
|
||||
* information. It only returns bytes of information when required.</p>
|
||||
* <p>Due to the form most commonly used for retrieval, array of char is used
|
||||
* to store the binary data.</p>
|
||||
* <p>UCharacterPropertyDB also contains information on accessing indexes to
|
||||
* significant points in the binary data.</p>
|
||||
* <p>Responsibility for molding the binary data into more meaning form lies on
|
||||
* <a href=UCharacter.html>UCharacter</a>.</p>
|
||||
* @author Syn Wee Quek
|
||||
* @since release 2.1, february 1st 2002
|
||||
*/
|
||||
|
||||
public final class UCharacterProperty
|
||||
{
|
||||
// public data members -----------------------------------------------
|
||||
|
||||
/**
|
||||
* Trie data
|
||||
*/
|
||||
public CharTrie m_trie_;
|
||||
/**
|
||||
* Optimization
|
||||
* CharTrie index array
|
||||
*/
|
||||
public char[] m_trieIndex_;
|
||||
/**
|
||||
* Optimization
|
||||
* CharTrie data array
|
||||
*/
|
||||
public char[] m_trieData_;
|
||||
/**
|
||||
* Optimization
|
||||
* CharTrie data offset
|
||||
*/
|
||||
public int m_trieInitialValue_;
|
||||
/**
|
||||
* Unicode version
|
||||
*/
|
||||
public VersionInfo m_unicodeVersion_;
|
||||
|
||||
// uprops.h enum UPropertySource --------------------------------------- ***
|
||||
|
||||
/** From uchar.c/uprops.icu properties vectors trie */
|
||||
public static final int SRC_PROPSVEC=2;
|
||||
/** One more than the highest UPropertySource (SRC_) constant. */
|
||||
public static final int SRC_COUNT=9;
|
||||
|
||||
// public methods ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Java friends implementation
|
||||
*/
|
||||
public void setIndexData(CharTrie.FriendAgent friendagent)
|
||||
{
|
||||
m_trieIndex_ = friendagent.getPrivateIndex();
|
||||
m_trieData_ = friendagent.getPrivateData();
|
||||
m_trieInitialValue_ = friendagent.getPrivateInitialValue();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the property value at the index.
|
||||
* This is optimized.
|
||||
* Note this is alittle different from CharTrie the index m_trieData_
|
||||
* is never negative.
|
||||
* @param ch code point whose property value is to be retrieved
|
||||
* @return property value of code point
|
||||
*/
|
||||
public final int getProperty(int ch)
|
||||
{
|
||||
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
|
||||
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
|
||||
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
|
||||
// BMP codepoint 0000..D7FF or DC00..FFFF
|
||||
// optimized
|
||||
try { // using try for ch < 0 is faster than using an if statement
|
||||
return m_trieData_[
|
||||
(m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
|
||||
<< Trie.INDEX_STAGE_2_SHIFT_)
|
||||
+ (ch & Trie.INDEX_STAGE_3_MASK_)];
|
||||
} catch (ArrayIndexOutOfBoundsException e) {
|
||||
return m_trieInitialValue_;
|
||||
}
|
||||
}
|
||||
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
// lead surrogate D800..DBFF
|
||||
return m_trieData_[
|
||||
(m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
|
||||
+ (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
|
||||
<< Trie.INDEX_STAGE_2_SHIFT_)
|
||||
+ (ch & Trie.INDEX_STAGE_3_MASK_)];
|
||||
}
|
||||
if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
|
||||
// supplementary code point 10000..10FFFF
|
||||
// look at the construction of supplementary characters
|
||||
// trail forms the ends of it.
|
||||
return m_trie_.getSurrogateValue(
|
||||
UTF16.getLeadSurrogate(ch),
|
||||
(char)(ch & Trie.SURROGATE_MASK_));
|
||||
}
|
||||
// ch is out of bounds
|
||||
// return m_dataOffset_ if there is an error, in this case we return
|
||||
// the default value: m_initialValue_
|
||||
// we cannot assume that m_initialValue_ is at offset 0
|
||||
// this is for optimization.
|
||||
return m_trieInitialValue_;
|
||||
|
||||
// this all is an inlined form of return m_trie_.getCodePointValue(ch);
|
||||
}
|
||||
|
||||
/**
|
||||
* Getting the unsigned numeric value of a character embedded in the property
|
||||
* argument
|
||||
* @param prop the character
|
||||
* @return unsigned numberic value
|
||||
*/
|
||||
public static int getUnsignedValue(int prop)
|
||||
{
|
||||
return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the unicode additional properties.
|
||||
* C version getUnicodeProperties.
|
||||
* @param codepoint codepoint whose additional properties is to be
|
||||
* retrieved
|
||||
* @param column
|
||||
* @return unicode properties
|
||||
*/
|
||||
public int getAdditional(int codepoint, int column) {
|
||||
if (column == -1) {
|
||||
return getProperty(codepoint);
|
||||
}
|
||||
if (column < 0 || column >= m_additionalColumnsCount_) {
|
||||
return 0;
|
||||
}
|
||||
return m_additionalVectors_[
|
||||
m_additionalTrie_.getCodePointValue(codepoint) + column];
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Get the "age" of the code point.</p>
|
||||
* <p>The "age" is the Unicode version when the code point was first
|
||||
* designated (as a non-character or for Private Use) or assigned a
|
||||
* character.</p>
|
||||
* <p>This can be useful to avoid emitting code points to receiving
|
||||
* processes that do not accept newer characters.</p>
|
||||
* <p>The data is from the UCD file DerivedAge.txt.</p>
|
||||
* <p>This API does not check the validity of the codepoint.</p>
|
||||
* @param codepoint The code point.
|
||||
* @return the Unicode version number
|
||||
*/
|
||||
public VersionInfo getAge(int codepoint)
|
||||
{
|
||||
int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
|
||||
return VersionInfo.getInstance(
|
||||
(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
|
||||
version & LAST_NIBBLE_MASK_, 0, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Forms a supplementary code point from the argument character<br>
|
||||
* Note this is for internal use hence no checks for the validity of the
|
||||
* surrogate characters are done
|
||||
* @param lead lead surrogate character
|
||||
* @param trail trailing surrogate character
|
||||
* @return code point of the supplementary character
|
||||
*/
|
||||
public static int getRawSupplementary(char lead, char trail)
|
||||
{
|
||||
return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads the property data and initialize the UCharacterProperty instance.
|
||||
* @throws MissingResourceException when data is missing or data has been corrupted
|
||||
*/
|
||||
public static UCharacterProperty getInstance()
|
||||
{
|
||||
if(INSTANCE_ == null) {
|
||||
try {
|
||||
INSTANCE_ = new UCharacterProperty();
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new MissingResourceException(e.getMessage(),"","");
|
||||
}
|
||||
}
|
||||
return INSTANCE_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the argument c is to be treated as a white space in ICU
|
||||
* rules. Usually ICU rule white spaces are ignored unless quoted.
|
||||
* Equivalent to test for Pattern_White_Space Unicode property.
|
||||
* Stable set of characters, won't change.
|
||||
* See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
|
||||
* @param c codepoint to check
|
||||
* @return true if c is a ICU white space
|
||||
*/
|
||||
public static boolean isRuleWhiteSpace(int c)
|
||||
{
|
||||
/* "white space" in the sense of ICU rule parsers
|
||||
This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
|
||||
See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
|
||||
U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
|
||||
Equivalent to test for Pattern_White_Space Unicode property.
|
||||
*/
|
||||
return (c >= 0x0009 && c <= 0x2029 &&
|
||||
(c <= 0x000D || c == 0x0020 || c == 0x0085 ||
|
||||
c == 0x200E || c == 0x200F || c >= 0x2028));
|
||||
}
|
||||
|
||||
// protected variables -----------------------------------------------
|
||||
|
||||
/**
|
||||
* Extra property trie
|
||||
*/
|
||||
CharTrie m_additionalTrie_;
|
||||
/**
|
||||
* Extra property vectors, 1st column for age and second for binary
|
||||
* properties.
|
||||
*/
|
||||
int m_additionalVectors_[];
|
||||
/**
|
||||
* Number of additional columns
|
||||
*/
|
||||
int m_additionalColumnsCount_;
|
||||
/**
|
||||
* Maximum values for block, bits used as in vector word
|
||||
* 0
|
||||
*/
|
||||
int m_maxBlockScriptValue_;
|
||||
/**
|
||||
* Maximum values for script, bits used as in vector word
|
||||
* 0
|
||||
*/
|
||||
int m_maxJTGValue_;
|
||||
|
||||
// private variables -------------------------------------------------
|
||||
|
||||
/**
|
||||
* UnicodeData.txt property object
|
||||
*/
|
||||
private static UCharacterProperty INSTANCE_ = null;
|
||||
|
||||
/**
|
||||
* Default name of the datafile
|
||||
*/
|
||||
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
|
||||
|
||||
/**
|
||||
* Default buffer size of datafile
|
||||
*/
|
||||
private static final int DATA_BUFFER_SIZE_ = 25000;
|
||||
|
||||
/**
|
||||
* Numeric value shift
|
||||
*/
|
||||
private static final int VALUE_SHIFT_ = 8;
|
||||
|
||||
/**
|
||||
* Mask to be applied after shifting to obtain an unsigned numeric value
|
||||
*/
|
||||
private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
|
||||
|
||||
/**
|
||||
* Shift value for lead surrogate to form a supplementary character.
|
||||
*/
|
||||
private static final int LEAD_SURROGATE_SHIFT_ = 10;
|
||||
/**
|
||||
* Offset to add to combined surrogate pair to avoid msking.
|
||||
*/
|
||||
private static final int SURROGATE_OFFSET_ =
|
||||
UTF16.SUPPLEMENTARY_MIN_VALUE -
|
||||
(UTF16.SURROGATE_MIN_VALUE <<
|
||||
LEAD_SURROGATE_SHIFT_) -
|
||||
UTF16.TRAIL_SURROGATE_MIN_VALUE;
|
||||
|
||||
// additional properties ----------------------------------------------
|
||||
|
||||
/**
|
||||
* First nibble shift
|
||||
*/
|
||||
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
|
||||
/**
|
||||
* Second nibble mask
|
||||
*/
|
||||
private static final int LAST_NIBBLE_MASK_ = 0xF;
|
||||
/**
|
||||
* Age value shift
|
||||
*/
|
||||
private static final int AGE_SHIFT_ = 24;
|
||||
|
||||
// private constructors --------------------------------------------------
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
* @exception IOException thrown when data reading fails or data corrupted
|
||||
*/
|
||||
private UCharacterProperty() throws IOException
|
||||
{
|
||||
// jar access
|
||||
InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
|
||||
BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
|
||||
UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
|
||||
reader.read(this);
|
||||
b.close();
|
||||
|
||||
m_trie_.putIndexData(this);
|
||||
}
|
||||
|
||||
public void upropsvec_addPropertyStarts(UnicodeSet set) {
|
||||
/* add the start code point of each same-value range of the properties vectors trie */
|
||||
if(m_additionalColumnsCount_>0) {
|
||||
/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
|
||||
TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
|
||||
RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
|
||||
while(propsVectorsIter.next(propsVectorsResult)){
|
||||
set.add(propsVectorsResult.start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
190
jdkSrc/jdk8/sun/text/normalizer/UCharacterPropertyReader.java
Normal file
190
jdkSrc/jdk8/sun/text/normalizer/UCharacterPropertyReader.java
Normal file
@@ -0,0 +1,190 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* <p>Internal reader class for ICU data file uprops.icu containing
|
||||
* Unicode codepoint data.</p>
|
||||
* <p>This class simply reads uprops.icu, authenticates that it is a valid
|
||||
* ICU data file and split its contents up into blocks of data for use in
|
||||
* <a href=UCharacterProperty.html>com.ibm.icu.impl.UCharacterProperty</a>.
|
||||
* </p>
|
||||
* <p>uprops.icu which is in big-endian format is jared together with this
|
||||
* package.</p>
|
||||
*
|
||||
* Unicode character properties file format see
|
||||
* (ICU4C)/source/tools/genprops/store.c
|
||||
*
|
||||
* @author Syn Wee Quek
|
||||
* @since release 2.1, February 1st 2002
|
||||
*/
|
||||
final class UCharacterPropertyReader implements ICUBinary.Authenticate
|
||||
{
|
||||
// public methods ----------------------------------------------------
|
||||
|
||||
public boolean isDataVersionAcceptable(byte version[])
|
||||
{
|
||||
return version[0] == DATA_FORMAT_VERSION_[0]
|
||||
&& version[2] == DATA_FORMAT_VERSION_[2]
|
||||
&& version[3] == DATA_FORMAT_VERSION_[3];
|
||||
}
|
||||
|
||||
// protected constructor ---------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Protected constructor.</p>
|
||||
* @param inputStream ICU uprop.dat file input stream
|
||||
* @exception IOException throw if data file fails authentication
|
||||
*/
|
||||
protected UCharacterPropertyReader(InputStream inputStream)
|
||||
throws IOException
|
||||
{
|
||||
m_unicodeVersion_ = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_,
|
||||
this);
|
||||
m_dataInputStream_ = new DataInputStream(inputStream);
|
||||
}
|
||||
|
||||
// protected methods -------------------------------------------------
|
||||
|
||||
/**
|
||||
* <p>Reads uprops.icu, parse it into blocks of data to be stored in
|
||||
* UCharacterProperty.</P
|
||||
* @param ucharppty UCharacterProperty instance
|
||||
* @exception IOException thrown when data reading fails
|
||||
*/
|
||||
protected void read(UCharacterProperty ucharppty) throws IOException
|
||||
{
|
||||
// read the indexes
|
||||
int count = INDEX_SIZE_;
|
||||
m_propertyOffset_ = m_dataInputStream_.readInt();
|
||||
count --;
|
||||
m_exceptionOffset_ = m_dataInputStream_.readInt();
|
||||
count --;
|
||||
m_caseOffset_ = m_dataInputStream_.readInt();
|
||||
count --;
|
||||
m_additionalOffset_ = m_dataInputStream_.readInt();
|
||||
count --;
|
||||
m_additionalVectorsOffset_ = m_dataInputStream_.readInt();
|
||||
count --;
|
||||
m_additionalColumnsCount_ = m_dataInputStream_.readInt();
|
||||
count --;
|
||||
m_reservedOffset_ = m_dataInputStream_.readInt();
|
||||
count --;
|
||||
m_dataInputStream_.skipBytes(3 << 2);
|
||||
count -= 3;
|
||||
ucharppty.m_maxBlockScriptValue_ = m_dataInputStream_.readInt();
|
||||
count --; // 10
|
||||
ucharppty.m_maxJTGValue_ = m_dataInputStream_.readInt();
|
||||
count --; // 11
|
||||
m_dataInputStream_.skipBytes(count << 2);
|
||||
|
||||
// read the trie index block
|
||||
// m_props_index_ in terms of ints
|
||||
ucharppty.m_trie_ = new CharTrie(m_dataInputStream_, null);
|
||||
|
||||
// skip the 32 bit properties block
|
||||
int size = m_exceptionOffset_ - m_propertyOffset_;
|
||||
m_dataInputStream_.skipBytes(size * 4);
|
||||
|
||||
// reads the 32 bit exceptions block
|
||||
size = m_caseOffset_ - m_exceptionOffset_;
|
||||
m_dataInputStream_.skipBytes(size * 4);
|
||||
|
||||
// reads the 32 bit case block
|
||||
size = (m_additionalOffset_ - m_caseOffset_) << 1;
|
||||
m_dataInputStream_.skipBytes(size * 2);
|
||||
|
||||
if(m_additionalColumnsCount_ > 0) {
|
||||
// reads the additional property block
|
||||
ucharppty.m_additionalTrie_ = new CharTrie(m_dataInputStream_, null);
|
||||
|
||||
// additional properties
|
||||
size = m_reservedOffset_ - m_additionalVectorsOffset_;
|
||||
ucharppty.m_additionalVectors_ = new int[size];
|
||||
for (int i = 0; i < size; i ++) {
|
||||
ucharppty.m_additionalVectors_[i] = m_dataInputStream_.readInt();
|
||||
}
|
||||
}
|
||||
|
||||
m_dataInputStream_.close();
|
||||
ucharppty.m_additionalColumnsCount_ = m_additionalColumnsCount_;
|
||||
ucharppty.m_unicodeVersion_ = VersionInfo.getInstance(
|
||||
(int)m_unicodeVersion_[0], (int)m_unicodeVersion_[1],
|
||||
(int)m_unicodeVersion_[2], (int)m_unicodeVersion_[3]);
|
||||
}
|
||||
|
||||
// private variables -------------------------------------------------
|
||||
|
||||
/**
|
||||
* Index size
|
||||
*/
|
||||
private static final int INDEX_SIZE_ = 16;
|
||||
|
||||
/**
|
||||
* ICU data file input stream
|
||||
*/
|
||||
private DataInputStream m_dataInputStream_;
|
||||
|
||||
/**
|
||||
* Offset information in the indexes.
|
||||
*/
|
||||
private int m_propertyOffset_;
|
||||
private int m_exceptionOffset_;
|
||||
private int m_caseOffset_;
|
||||
private int m_additionalOffset_;
|
||||
private int m_additionalVectorsOffset_;
|
||||
private int m_additionalColumnsCount_;
|
||||
private int m_reservedOffset_;
|
||||
private byte m_unicodeVersion_[];
|
||||
|
||||
/**
|
||||
* Data format "UPro".
|
||||
*/
|
||||
private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x50,
|
||||
(byte)0x72, (byte)0x6F};
|
||||
/**
|
||||
* Format version; this code works with all versions with the same major
|
||||
* version number and the same Trie bit distribution.
|
||||
*/
|
||||
private static final byte DATA_FORMAT_VERSION_[] = {(byte)0x5, (byte)0,
|
||||
(byte)Trie.INDEX_STAGE_1_SHIFT_,
|
||||
(byte)Trie.INDEX_STAGE_2_SHIFT_};
|
||||
}
|
||||
538
jdkSrc/jdk8/sun/text/normalizer/UTF16.java
Normal file
538
jdkSrc/jdk8/sun/text/normalizer/UTF16.java
Normal file
@@ -0,0 +1,538 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
/**
 * <p>Standalone utility class providing UTF16 character conversions and
 * indexing conversions.</p>
 * <p>Code that uses strings alone rarely needs modification.
 * By design, UTF-16 does not allow overlap, so searching for strings is a safe
 * operation. Similarly, concatenation is always safe. Substringing is safe if
 * the start and end are both on UTF-32 boundaries. In normal code, the values
 * for start and end are on those boundaries, since they arose from operations
 * like searching. If not, the nearest UTF-32 boundaries can be determined
 * using <code>bounds()</code>.</p>
 * <strong>Examples:</strong>
 * <p>The following examples illustrate use of some of these methods.
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i &lt; s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i &gt;= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * int ch;
 * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 * <strong>Notes:</strong>
 * <ul>
 *   <li>
 *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
 *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
 *   sense of their ordering in a string. <code>offset16</code> and
 *   <code>offset32</code> are used to distinguish offsets to UTF-16
 *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
 *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
 *   which is a UTF-16 code unit.
 *   </li>
 *   <li>
 *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
 *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
 *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
 *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
 *   </li>
 *   <li>
 *   <strong>Exceptions:</strong> The error checking will throw an exception
 *   if indices are out of bounds. Other than that, all methods will
 *   behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
 *   values are present. <code>UCharacter.isLegal()</code> can be used to check
 *   for validity if desired.
 *   </li>
 *   <li>
 *   <strong>Unmatched Surrogates:</strong> If the string contains unmatched
 *   surrogates, then these are counted as one UTF-32 value. This matches
 *   their iteration behavior, which is vital. It also matches common display
 *   practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
 *   </li>
 *   <li>
 *   <strong>Optimization:</strong> The method implementations may need
 *   optimization if the compiler doesn't fold static final methods. Since
 *   surrogate pairs will form an exceedingly small percentage of all the text
 *   in the world, the singleton case should always be optimized for.
 *   </li>
 * </ul>
 * @author Mark Davis, with help from Markus Scherer
 * @stable ICU 2.1
 */

public final class UTF16
{
    // public variables ---------------------------------------------------

    /**
     * The lowest Unicode code point value.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MIN_VALUE = 0;
    /**
     * The highest Unicode code point value (scalar value) according to the
     * Unicode Standard.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
    /**
     * The minimum value for Supplementary code points
     * @stable ICU 2.1
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
    /**
     * Surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    // public method ------------------------------------------------------

    /**
     * Extract a single UTF-32 value from a string.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as for random access. If a
     * validity check is required, use <code>UCharacter.isLegal()</code> on
     * the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found, the unmatched surrogate itself is returned.
     * @param source string of UTF-16 code units
     * @param offset16 UTF-16 offset of the code unit to examine; it may point
     *        at either half of a surrogate pair
     * @return UTF-32 value for the UTF-32 value that contains the char at
     *         offset16. The boundaries of that codepoint are the same as in
     *         <code>bounds32()</code>.
     * @exception IndexOutOfBoundsException thrown if offset16 is out of
     *            bounds.
     * @stable ICU 2.1
     */
    public static int charAt(String source, int offset16)
    {
        char single = source.charAt(offset16);
        if (single < LEAD_SURROGATE_MIN_VALUE) {
            // overwhelmingly common case: below the surrogate range
            return single;
        }
        return _charAt(source, offset16, single);
    }

    /**
     * Slow path of {@link #charAt(String, int)} for code units at or above
     * the surrogate range.
     * @param source string of UTF-16 code units
     * @param offset16 valid offset of <code>single</code> in source
     * @param single the code unit found at offset16
     * @return the supplementary code point if single is half of a well formed
     *         pair, otherwise single itself
     */
    private static int _charAt(String source, int offset16, char single)
    {
        if (single > TRAIL_SURROGATE_MAX_VALUE) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.

        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            // single is a lead surrogate, so look at the following unit
            int next = offset16 + 1;
            if (next != source.length()) {
                char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else {
            // single is a trail surrogate, so look at the preceding unit
            int previous = offset16 - 1;
            if (previous >= 0) {
                char lead = source.charAt(previous);
                if (isLeadSurrogate(lead)) {
                    return Character.toCodePoint(lead, single);
                }
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Extract a single UTF-32 value from a substring.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as for random access. If a
     * validity check is required, use <code>UCharacter.isLegal()</code> on
     * the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found within [start, limit), the unmatched surrogate itself is
     * returned.
     * @param source array of UTF-16 chars
     * @param start offset of the first char of the substring in the source
     *        array
     * @param limit offset just past the last char of the substring in the
     *        source array
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the UTF-32 value that contains the char at
     *         offset16. The boundaries of that codepoint are the same as in
     *         <code>bounds32()</code>.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char source[], int start, int limit,
                             int offset16)
    {
        offset16 += start;
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (single <= LEAD_SURROGATE_MAX_VALUE) {
            offset16++;
            if (offset16 >= limit) {
                return single;
            }
            char trail = source[offset16];
            if (isTrailSurrogate(trail)) {
                return Character.toCodePoint(single, trail);
            }
        } else { // isTrailSurrogate(single), so
            if (offset16 == start) {
                return single;
            }
            offset16--;
            char lead = source[offset16];
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     * @stable ICU 2.1
     */
    public static int getCharCount(int char32)
    {
        if (char32 < SUPPLEMENTARY_MIN_VALUE) {
            return 1;
        }
        return 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16)
    {
        return LEAD_SURROGATE_MIN_VALUE <= char16 &&
            char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16)
    {
        return TRAIL_SURROGATE_MIN_VALUE <= char16 &&
            char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16)
    {
        return LEAD_SURROGATE_MIN_VALUE <= char16 &&
            char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Returns the lead surrogate.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     * @param char32 the input character.
     * @return lead surrogate if the getCharCount(char32) is 2; <br>
     *         and 0 otherwise (note: 0 is not a valid lead surrogate).
     * @stable ICU 2.1
     */
    public static char getLeadSurrogate(int char32)
    {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char)(LEAD_SURROGATE_OFFSET_ +
                          (char32 >> LEAD_SURROGATE_SHIFT_));
        }
        return 0;
    }

    /**
     * Returns the trail surrogate.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     * @param char32 the input character.
     * @return the trail surrogate if the getCharCount(char32) is 2;
     *         <br>otherwise the character itself, cast to char
     * @stable ICU 2.1
     */
    public static char getTrailSurrogate(int char32)
    {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char)(TRAIL_SURROGATE_MIN_VALUE +
                          (char32 & TRAIL_SURROGATE_MASK_));
        }
        return (char)char32;
    }

    /**
     * Convenience method corresponding to String.valueOf(char). Returns a one
     * or two char string containing the UTF-32 value in UTF16 format. If a
     * validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     * @param char32 the input character.
     * @return string value of char32 in UTF16 format
     * @exception IllegalArgumentException thrown if char32 is an invalid
     *            codepoint.
     * @stable ICU 2.1
     */
    public static String valueOf(int char32)
    {
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint");
        }
        return toString(char32);
    }

    /**
     * Append a single UTF-32 value to the end of a StringBuffer.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     * @param target the buffer to append to
     * @param char32 value to append.
     * @return the updated StringBuffer
     * @exception IllegalArgumentException thrown when char32 does not lie
     *            within the range of the Unicode codepoints
     * @stable ICU 2.1
     */
    public static StringBuffer append(StringBuffer target, int char32)
    {
        // Check for irregular values
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
        }

        // Write the UTF-16 values
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            target.append(getLeadSurrogate(char32));
            target.append(getTrailSurrogate(char32));
        } else {
            target.append((char)char32);
        }
        return target;
    }

    //// for StringPrep
    /**
     * Shifts offset16 by the argument number of codepoints within a subarray.
     * A well formed surrogate pair inside [start, limit) counts as one
     * codepoint; any other code unit counts as one codepoint by itself.
     * @param source char array
     * @param start offset of the first char of the subarray
     * @param limit offset just past the last char of the subarray
     * @param offset16 UTF16 position to shift relative to start
     * @param shift32 number of codepoints to shift; negative values shift
     *        towards start
     * @return new shifted offset16 relative to start
     * @exception IndexOutOfBoundsException if the new offset16 is out of
     *            bounds with respect to the subarray or the subarray bounds
     *            are out of range.
     * @stable ICU 2.1
     */
    public static int moveCodePointOffset(char source[], int start, int limit,
                                          int offset16, int shift32)
    {
        int size = source.length;
        if (start < 0 || limit < start) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (limit > size) {
            throw new StringIndexOutOfBoundsException(limit);
        }
        // Compared by subtraction so that a huge offset16 cannot wrap
        // around and slip past the check: 0 <= limit - start here.
        if (offset16 < 0 || offset16 > limit - start) {
            throw new StringIndexOutOfBoundsException(offset16);
        }

        int result = offset16 + start;   // now known to be in [start, limit]
        int count;
        if (shift32 > 0) {
            // same test as shift32 + result > size, minus the int overflow
            if (shift32 > size - result) {
                throw new StringIndexOutOfBoundsException(result);
            }
            count = shift32;
            while (result < limit && count > 0) {
                char ch = source[result];
                if (isLeadSurrogate(ch) && (result + 1 < limit) &&
                    isTrailSurrogate(source[result + 1])) {
                    result++;   // step over the trail half as well
                }
                count--;
                result++;
            }
        } else {
            if (result + shift32 < start) {
                throw new StringIndexOutOfBoundsException(result);
            }
            for (count = -shift32; count > 0; count--) {
                result--;
                if (result < start) {
                    break;
                }
                char ch = source[result];
                if (isTrailSurrogate(ch) && result > start &&
                    isLeadSurrogate(source[result - 1])) {
                    result--;   // step back over the lead half as well
                }
            }
        }
        if (count != 0) {
            // ran into a subarray boundary before shift32 codepoints were
            // consumed
            throw new StringIndexOutOfBoundsException(shift32);
        }
        return result - start;
    }

    // private data members -------------------------------------------------

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask to retrieve the significant value from a trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Value that all lead surrogate starts with
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
        LEAD_SURROGATE_MIN_VALUE -
        (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    // private methods ------------------------------------------------------

    /**
     * <p>Converts argument code point and returns a String object representing
     * the code point's value in UTF16 format.</p>
     * <p>This method does not check for the validity of the codepoint, the
     * results are not guaranteed if an invalid codepoint is passed as
     * argument.</p>
     * <p>The result is a string whose length is 1 for non-supplementary code
     * points, 2 otherwise.</p>
     * @param ch code point
     * @return string representation of the code point
     */
    private static String toString(int ch)
    {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            return String.valueOf((char)ch);
        }
        // exactly two code units: no need for a synchronized StringBuffer
        return new String(new char[] {
            getLeadSurrogate(ch), getTrailSurrogate(ch)
        });
    }
}
|
||||
58
jdkSrc/jdk8/sun/text/normalizer/UnicodeMatcher.java
Normal file
58
jdkSrc/jdk8/sun/text/normalizer/UnicodeMatcher.java
Normal file
@@ -0,0 +1,58 @@
|
||||
/*
|
||||
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
/**
|
||||
* <code>UnicodeMatcher</code> defines a protocol for objects that can
|
||||
* match a range of characters in a Replaceable string.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public interface UnicodeMatcher {
|
||||
|
||||
/**
|
||||
* The character at index i, where i < contextStart || i >= contextLimit,
|
||||
* is ETHER. This allows explicit matching by rules and UnicodeSets
|
||||
* of text outside the context. In traditional terms, this allows anchoring
|
||||
* at the start and/or end.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
static final char ETHER = '\uFFFF';
|
||||
|
||||
}
|
||||
|
||||
//eof
|
||||
1869
jdkSrc/jdk8/sun/text/normalizer/UnicodeSet.java
Normal file
1869
jdkSrc/jdk8/sun/text/normalizer/UnicodeSet.java
Normal file
File diff suppressed because it is too large
Load Diff
219
jdkSrc/jdk8/sun/text/normalizer/UnicodeSetIterator.java
Normal file
219
jdkSrc/jdk8/sun/text/normalizer/UnicodeSetIterator.java
Normal file
@@ -0,0 +1,219 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* UnicodeSetIterator iterates over the contents of a UnicodeSet. It
|
||||
* iterates over either code points or code point ranges. After all
|
||||
* code points or ranges have been returned, it returns the
|
||||
* multicharacter strings of the UnicodeSet, if any.
|
||||
*
|
||||
* <p>To iterate over code points, use a loop like this:
|
||||
* <pre>
|
||||
* UnicodeSetIterator it = new UnicodeSetIterator(set);
* while (it.next()) {
*     if (it.codepoint != UnicodeSetIterator.IS_STRING) {
*         processCodepoint(it.codepoint);
*     } else {
*         processString(it.string);
*     }
* }
|
||||
* </pre>
|
||||
*
|
||||
* <p>To iterate over code point ranges, use a loop like this:
|
||||
* <pre>
|
||||
* UnicodeSetIterator it = new UnicodeSetIterator(set);
* while (it.nextRange()) {
*     if (it.codepoint != UnicodeSetIterator.IS_STRING) {
*         processCodepointRange(it.codepoint, it.codepointEnd);
*     } else {
*         processString(it.string);
*     }
* }
|
||||
* </pre>
|
||||
* @author M. Davis
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public class UnicodeSetIterator {
|
||||
|
||||
/**
|
||||
* Value of <tt>codepoint</tt> if the iterator points to a string.
|
||||
* If <tt>codepoint == IS_STRING</tt>, then examine
|
||||
* <tt>string</tt> for the current iteration result.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public static int IS_STRING = -1;
|
||||
|
||||
/**
|
||||
* Current code point, or the special value <tt>IS_STRING</tt>, if
|
||||
* the iterator points to a string.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int codepoint;
|
||||
|
||||
/**
|
||||
* When iterating over ranges using <tt>nextRange()</tt>,
|
||||
* <tt>codepointEnd</tt> contains the inclusive end of the
|
||||
* iteration range, if <tt>codepoint != IS_STRING</tt>. If
|
||||
* iterating over code points using <tt>next()</tt>, or if
|
||||
* <tt>codepoint == IS_STRING</tt>, then the value of
|
||||
* <tt>codepointEnd</tt> is undefined.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int codepointEnd;
|
||||
|
||||
/**
|
||||
* If <tt>codepoint == IS_STRING</tt>, then <tt>string</tt> points
|
||||
* to the current string. If <tt>codepoint != IS_STRING</tt>, the
|
||||
* value of <tt>string</tt> is undefined.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public String string;
|
||||
|
||||
/**
|
||||
* Create an iterator over the given set.
|
||||
* @param set set to iterate over
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public UnicodeSetIterator(UnicodeSet set) {
|
||||
reset(set);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next element in the set, either a code point range
|
||||
* or a string. If there are no more elements in the set, return
|
||||
* false. If <tt>codepoint == IS_STRING</tt>, the value is a
|
||||
* string in the <tt>string</tt> field. Otherwise the value is a
|
||||
* range of one or more code points from <tt>codepoint</tt> to
|
||||
* <tt>codepointeEnd</tt> inclusive.
|
||||
*
|
||||
* <p>The order of iteration is all code points ranges in sorted
|
||||
* order, followed by all strings sorted order. Ranges are
|
||||
* disjoint and non-contiguous. <tt>string</tt> is undefined
|
||||
* unless <tt>codepoint == IS_STRING</tt>. Do not mix calls to
|
||||
* <tt>next()</tt> and <tt>nextRange()</tt> without calling
|
||||
* <tt>reset()</tt> between them. The results of doing so are
|
||||
* undefined.
|
||||
*
|
||||
* @return true if there was another element in the set and this
|
||||
* object contains the element.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public boolean nextRange() {
|
||||
if (nextElement <= endElement) {
|
||||
codepointEnd = endElement;
|
||||
codepoint = nextElement;
|
||||
nextElement = endElement+1;
|
||||
return true;
|
||||
}
|
||||
if (range < endRange) {
|
||||
loadRange(++range);
|
||||
codepointEnd = endElement;
|
||||
codepoint = nextElement;
|
||||
nextElement = endElement+1;
|
||||
return true;
|
||||
}
|
||||
|
||||
// stringIterator == null iff there are no string elements remaining
|
||||
|
||||
if (stringIterator == null) return false;
|
||||
codepoint = IS_STRING; // signal that value is actually a string
|
||||
string = stringIterator.next();
|
||||
if (!stringIterator.hasNext()) stringIterator = null;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets this iterator to visit the elements of the given set and
|
||||
* resets it to the start of that set. The iterator is valid only
|
||||
* so long as <tt>set</tt> is valid.
|
||||
* @param set the set to iterate over.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public void reset(UnicodeSet uset) {
|
||||
set = uset;
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets this iterator to the start of the set.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public void reset() {
|
||||
endRange = set.getRangeCount() - 1;
|
||||
range = 0;
|
||||
endElement = -1;
|
||||
nextElement = 0;
|
||||
if (endRange >= 0) {
|
||||
loadRange(range);
|
||||
}
|
||||
stringIterator = null;
|
||||
if (set.strings != null) {
|
||||
stringIterator = set.strings.iterator();
|
||||
if (!stringIterator.hasNext()) stringIterator = null;
|
||||
}
|
||||
}
|
||||
|
||||
// ======================= PRIVATES ===========================
|
||||
|
||||
private UnicodeSet set;
|
||||
private int endRange = 0;
|
||||
private int range = 0;
|
||||
/**
|
||||
* @internal
|
||||
*/
|
||||
protected int endElement;
|
||||
/**
|
||||
* @internal
|
||||
*/
|
||||
protected int nextElement;
|
||||
private Iterator<String> stringIterator = null;
|
||||
|
||||
/**
|
||||
* Invariant: stringIterator is null when there are no (more) strings remaining
|
||||
*/
|
||||
|
||||
/**
|
||||
* @internal
|
||||
*/
|
||||
protected void loadRange(int aRange) {
|
||||
nextElement = set.getRangeStart(aRange);
|
||||
endElement = set.getRangeEnd(aRange);
|
||||
}
|
||||
}
|
||||
385
jdkSrc/jdk8/sun/text/normalizer/Utility.java
Normal file
385
jdkSrc/jdk8/sun/text/normalizer/Utility.java
Normal file
@@ -0,0 +1,385 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
public final class Utility {
|
||||
|
||||
/**
|
||||
* Convenience utility to compare two Object[]s
|
||||
* Ought to be in System.
|
||||
* @param len the length to compare.
|
||||
* The start indices and start+len must be valid.
|
||||
*/
|
||||
public final static boolean arrayRegionMatches(char[] source, int sourceStart,
|
||||
char[] target, int targetStart,
|
||||
int len)
|
||||
{
|
||||
int sourceEnd = sourceStart + len;
|
||||
int delta = targetStart - sourceStart;
|
||||
for (int i = sourceStart; i < sourceEnd; i++) {
|
||||
if (source[i]!=target[i + delta])
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert characters outside the range U+0020 to U+007F to
|
||||
* Unicode escapes, and convert backslash to a double backslash.
|
||||
*/
|
||||
public static final String escape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=0; i<s.length(); ) {
|
||||
int c = UTF16.charAt(s, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
if (c >= ' ' && c <= 0x007F) {
|
||||
if (c == '\\') {
|
||||
buf.append("\\\\"); // That is, "\\"
|
||||
} else {
|
||||
buf.append((char)c);
|
||||
}
|
||||
} else {
|
||||
boolean four = c <= 0xFFFF;
|
||||
buf.append(four ? "\\u" : "\\U");
|
||||
hex(c, four ? 4 : 8, buf);
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
|
||||
static private final char[] UNESCAPE_MAP = {
|
||||
/*" 0x22, 0x22 */
|
||||
/*' 0x27, 0x27 */
|
||||
/*? 0x3F, 0x3F */
|
||||
/*\ 0x5C, 0x5C */
|
||||
/*a*/ 0x61, 0x07,
|
||||
/*b*/ 0x62, 0x08,
|
||||
/*e*/ 0x65, 0x1b,
|
||||
/*f*/ 0x66, 0x0c,
|
||||
/*n*/ 0x6E, 0x0a,
|
||||
/*r*/ 0x72, 0x0d,
|
||||
/*t*/ 0x74, 0x09,
|
||||
/*v*/ 0x76, 0x0b
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert an escape to a 32-bit code point value. We attempt
|
||||
* to parallel the icu4c unescapeAt() function.
|
||||
* @param offset16 an array containing offset to the character
|
||||
* <em>after</em> the backslash. Upon return offset16[0] will
|
||||
* be updated to point after the escape sequence.
|
||||
* @return character value from 0 to 10FFFF, or -1 on error.
|
||||
*/
|
||||
public static int unescapeAt(String s, int[] offset16) {
|
||||
int c;
|
||||
int result = 0;
|
||||
int n = 0;
|
||||
int minDig = 0;
|
||||
int maxDig = 0;
|
||||
int bitsPerDigit = 4;
|
||||
int dig;
|
||||
int i;
|
||||
boolean braces = false;
|
||||
|
||||
/* Check that offset is in range */
|
||||
int offset = offset16[0];
|
||||
int length = s.length();
|
||||
if (offset < 0 || offset >= length) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Fetch first UChar after '\\' */
|
||||
c = UTF16.charAt(s, offset);
|
||||
offset += UTF16.getCharCount(c);
|
||||
|
||||
/* Convert hexadecimal and octal escapes */
|
||||
switch (c) {
|
||||
case 'u':
|
||||
minDig = maxDig = 4;
|
||||
break;
|
||||
case 'U':
|
||||
minDig = maxDig = 8;
|
||||
break;
|
||||
case 'x':
|
||||
minDig = 1;
|
||||
if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
|
||||
++offset;
|
||||
braces = true;
|
||||
maxDig = 8;
|
||||
} else {
|
||||
maxDig = 2;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
dig = UCharacter.digit(c, 8);
|
||||
if (dig >= 0) {
|
||||
minDig = 1;
|
||||
maxDig = 3;
|
||||
n = 1; /* Already have first octal digit */
|
||||
bitsPerDigit = 3;
|
||||
result = dig;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (minDig != 0) {
|
||||
while (offset < length && n < maxDig) {
|
||||
c = UTF16.charAt(s, offset);
|
||||
dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
|
||||
if (dig < 0) {
|
||||
break;
|
||||
}
|
||||
result = (result << bitsPerDigit) | dig;
|
||||
offset += UTF16.getCharCount(c);
|
||||
++n;
|
||||
}
|
||||
if (n < minDig) {
|
||||
return -1;
|
||||
}
|
||||
if (braces) {
|
||||
if (c != 0x7D /*}*/) {
|
||||
return -1;
|
||||
}
|
||||
++offset;
|
||||
}
|
||||
if (result < 0 || result >= 0x110000) {
|
||||
return -1;
|
||||
}
|
||||
// If an escape sequence specifies a lead surrogate, see
|
||||
// if there is a trail surrogate after it, either as an
|
||||
// escape or as a literal. If so, join them up into a
|
||||
// supplementary.
|
||||
if (offset < length &&
|
||||
UTF16.isLeadSurrogate((char) result)) {
|
||||
int ahead = offset+1;
|
||||
c = s.charAt(offset); // [sic] get 16-bit code unit
|
||||
if (c == '\\' && ahead < length) {
|
||||
int o[] = new int[] { ahead };
|
||||
c = unescapeAt(s, o);
|
||||
ahead = o[0];
|
||||
}
|
||||
if (UTF16.isTrailSurrogate((char) c)) {
|
||||
offset = ahead;
|
||||
result = UCharacterProperty.getRawSupplementary(
|
||||
(char) result, (char) c);
|
||||
}
|
||||
}
|
||||
offset16[0] = offset;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Convert C-style escapes in table */
|
||||
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
|
||||
if (c == UNESCAPE_MAP[i]) {
|
||||
offset16[0] = offset;
|
||||
return UNESCAPE_MAP[i+1];
|
||||
} else if (c < UNESCAPE_MAP[i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Map \cX to control-X: X & 0x1F */
|
||||
if (c == 'c' && offset < length) {
|
||||
c = UTF16.charAt(s, offset);
|
||||
offset16[0] = offset + UTF16.getCharCount(c);
|
||||
return 0x1F & c;
|
||||
}
|
||||
|
||||
/* If no special forms are recognized, then consider
|
||||
* the backslash to generically escape the next character. */
|
||||
offset16[0] = offset;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a integer to size width hex uppercase digits.
|
||||
* E.g., hex('a', 4, str) => "0041".
|
||||
* Append the output to the given StringBuffer.
|
||||
* If width is too small to fit, nothing will be appended to output.
|
||||
*/
|
||||
public static StringBuffer hex(int ch, int width, StringBuffer output) {
|
||||
return appendNumber(output, ch, 16, width);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a integer to size width (minimum) hex uppercase digits.
|
||||
* E.g., hex('a', 4, str) => "0041". If the integer requires more
|
||||
* than width digits, more will be used.
|
||||
*/
|
||||
public static String hex(int ch, int width) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
return appendNumber(buf, ch, 16, width).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip over a sequence of zero or more white space characters
|
||||
* at pos. Return the index of the first non-white-space character
|
||||
* at or after pos, or str.length(), if there is none.
|
||||
*/
|
||||
public static int skipWhitespace(String str, int pos) {
|
||||
while (pos < str.length()) {
|
||||
int c = UTF16.charAt(str, pos);
|
||||
if (!UCharacterProperty.isRuleWhiteSpace(c)) {
|
||||
break;
|
||||
}
|
||||
pos += UTF16.getCharCount(c);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static final char DIGITS[] = {
|
||||
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
||||
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
|
||||
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
|
||||
'U', 'V', 'W', 'X', 'Y', 'Z'
|
||||
};
|
||||
|
||||
/**
|
||||
* Append the digits of a positive integer to the given
|
||||
* <code>StringBuffer</code> in the given radix. This is
|
||||
* done recursively since it is easiest to generate the low-
|
||||
* order digit first, but it must be appended last.
|
||||
*
|
||||
* @param result is the <code>StringBuffer</code> to append to
|
||||
* @param n is the positive integer
|
||||
* @param radix is the radix, from 2 to 36 inclusive
|
||||
* @param minDigits is the minimum number of digits to append.
|
||||
*/
|
||||
private static void recursiveAppendNumber(StringBuffer result, int n,
|
||||
int radix, int minDigits)
|
||||
{
|
||||
int digit = n % radix;
|
||||
|
||||
if (n >= radix || minDigits > 1) {
|
||||
recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
|
||||
}
|
||||
|
||||
result.append(DIGITS[digit]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append a number to the given StringBuffer in the given radix.
|
||||
* Standard digits '0'-'9' are used and letters 'A'-'Z' for
|
||||
* radices 11 through 36.
|
||||
* @param result the digits of the number are appended here
|
||||
* @param n the number to be converted to digits; may be negative.
|
||||
* If negative, a '-' is prepended to the digits.
|
||||
* @param radix a radix from 2 to 36 inclusive.
|
||||
* @param minDigits the minimum number of digits, not including
|
||||
* any '-', to produce. Values less than 2 have no effect. One
|
||||
* digit is always emitted regardless of this parameter.
|
||||
* @return a reference to result
|
||||
*/
|
||||
public static StringBuffer appendNumber(StringBuffer result, int n,
|
||||
int radix, int minDigits)
|
||||
throws IllegalArgumentException
|
||||
{
|
||||
if (radix < 2 || radix > 36) {
|
||||
throw new IllegalArgumentException("Illegal radix " + radix);
|
||||
}
|
||||
|
||||
|
||||
int abs = n;
|
||||
|
||||
if (n < 0) {
|
||||
abs = -n;
|
||||
result.append("-");
|
||||
}
|
||||
|
||||
recursiveAppendNumber(result, abs, radix, minDigits);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the character is NOT printable ASCII. The tab,
|
||||
* newline and linefeed characters are considered unprintable.
|
||||
*/
|
||||
public static boolean isUnprintable(int c) {
|
||||
return !(c >= 0x20 && c <= 0x7E);
|
||||
}
|
||||
|
||||
/**
|
||||
* Escape unprintable characters using <backslash>uxxxx notation
|
||||
* for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
|
||||
* above. If the character is printable ASCII, then do nothing
|
||||
* and return FALSE. Otherwise, append the escaped notation and
|
||||
* return TRUE.
|
||||
*/
|
||||
public static boolean escapeUnprintable(StringBuffer result, int c) {
|
||||
if (isUnprintable(c)) {
|
||||
result.append('\\');
|
||||
if ((c & ~0xFFFF) != 0) {
|
||||
result.append('U');
|
||||
result.append(DIGITS[0xF&(c>>28)]);
|
||||
result.append(DIGITS[0xF&(c>>24)]);
|
||||
result.append(DIGITS[0xF&(c>>20)]);
|
||||
result.append(DIGITS[0xF&(c>>16)]);
|
||||
} else {
|
||||
result.append('u');
|
||||
}
|
||||
result.append(DIGITS[0xF&(c>>12)]);
|
||||
result.append(DIGITS[0xF&(c>>8)]);
|
||||
result.append(DIGITS[0xF&(c>>4)]);
|
||||
result.append(DIGITS[0xF&c]);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Similar to StringBuffer.getChars, version 1.3.
|
||||
* Since JDK 1.2 implements StringBuffer.getChars differently, this method
|
||||
* is here to provide consistent results.
|
||||
* To be removed after JDK 1.2 ceased to be the reference platform.
|
||||
* @param src source string buffer
|
||||
* @param srcBegin offset to the start of the src to retrieve from
|
||||
* @param srcEnd offset to the end of the src to retrieve from
|
||||
* @param dst char array to store the retrieved chars
|
||||
* @param dstBegin offset to the start of the destination char array to
|
||||
* store the retrieved chars
|
||||
*/
|
||||
public static void getChars(StringBuffer src, int srcBegin, int srcEnd,
|
||||
char dst[], int dstBegin)
|
||||
{
|
||||
if (srcBegin == srcEnd) {
|
||||
return;
|
||||
}
|
||||
src.getChars(srcBegin, srcEnd, dst, dstBegin);
|
||||
}
|
||||
|
||||
}
|
||||
185
jdkSrc/jdk8/sun/text/normalizer/VersionInfo.java
Normal file
185
jdkSrc/jdk8/sun/text/normalizer/VersionInfo.java
Normal file
@@ -0,0 +1,185 @@
|
||||
/*
|
||||
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
/*
|
||||
*******************************************************************************
|
||||
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
|
||||
* *
|
||||
* The original version of this source code and documentation is copyrighted *
|
||||
* and owned by IBM, These materials are provided under terms of a License *
|
||||
* Agreement between IBM and Sun. This technology is protected by multiple *
|
||||
* US and International patents. This notice and attribution to IBM may not *
|
||||
* to removed. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Class to store version numbers of the form major.minor.milli.micro.
|
||||
* @author synwee
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public final class VersionInfo
{

    // public methods ------------------------------------------------------

    /**
     * Returns an instance of VersionInfo with the argument version.
     * @param version version String in the format of "major.minor.milli.micro"
     *                or "major.minor.milli" or "major.minor" or "major",
     *                where major, minor, milli, micro are non-negative numbers
     *                <= 255. If the trailing version numbers are
     *                not specified they are taken as 0s. E.g. Version "3.1" is
     *                equivalent to "3.1.0.0".
     * @return an instance of VersionInfo with the argument version.
     * @throws IllegalArgumentException when the argument version
     *                is not in the right format
     * @stable ICU 2.6
     */
    public static VersionInfo getInstance(String version)
    {
        int length  = version.length();
        int array[] = {0, 0, 0, 0};
        int count   = 0;
        int index   = 0;

        while (count < 4 && index < length) {
            char c = version.charAt(index);
            if (c == '.') {
                count ++;
            }
            else {
                int digit = c - '0';
                if (digit < 0 || digit > 9) {
                    throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
                }
                // Stop accumulating once the field is already out of range:
                // a long digit run would otherwise overflow the int and could
                // wrap back into 0..255 (e.g. "4294967296" wrapping to 0).
                // The range check after the loop still rejects the value.
                if (array[count] <= 255) {
                    array[count] = array[count] * 10 + digit;
                }
            }
            index ++;
        }
        // Leftover characters mean there were more than four fields.
        if (index != length) {
            throw new IllegalArgumentException(
                               "Invalid version number: String '" + version + "' exceeds version format");
        }
        for (int i = 0; i < 4; i ++) {
            if (array[i] < 0 || array[i] > 255) {
                throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
            }
        }

        return getInstance(array[0], array[1], array[2], array[3]);
    }

    /**
     * Returns an instance of VersionInfo with the argument version.
     * Instances are cached, so equal version numbers yield the same object.
     * The cache is a plain HashMap and this method does no synchronization.
     * @param major major version, non-negative number <= 255.
     * @param minor minor version, non-negative number <= 255.
     * @param milli milli version, non-negative number <= 255.
     * @param micro micro version, non-negative number <= 255.
     * @return the cached or newly created instance for the given version
     * @throws IllegalArgumentException when either arguments are
     *                                     negative or > 255
     * @stable ICU 2.6
     */
    public static VersionInfo getInstance(int major, int minor, int milli,
                                          int micro)
    {
        if (major < 0 || major > 255 || minor < 0 || minor > 255 ||
            milli < 0 || milli > 255 || micro < 0 || micro > 255) {
            throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
        }
        int version = getInt(major, minor, milli, micro);
        Integer key = Integer.valueOf(version);
        // Reuse the cached singleton when present, otherwise create and cache it.
        VersionInfo result = MAP_.get(key);
        if (result == null) {
            result = new VersionInfo(version);
            MAP_.put(key, result);
        }
        return result;
    }

    /**
     * Compares other with this VersionInfo.
     * @param other VersionInfo to be compared
     * @return 0 if the argument is a VersionInfo object that has version
     *           information equals to this object.
     *           Less than 0 if the argument is a VersionInfo object that has
     *           version information greater than this object.
     *           Greater than 0 if the argument is a VersionInfo object that
     *           has version information less than this object.
     * @stable ICU 2.6
     */
    public int compareTo(VersionInfo other)
    {
        // The packed value uses all 32 bits (major in the top byte), so it
        // must be compared as unsigned; plain subtraction mis-orders any
        // version whose major number is 128 or more.
        return Integer.compareUnsigned(m_version_, other.m_version_);
    }

    // private data members ----------------------------------------------

    /**
     * Version number stored as a byte for each of the major, minor, milli and
     * micro numbers in the 32 bit int.
     * Most significant for the major and the least significant contains the
     * micro numbers.
     */
    private final int m_version_;
    /**
     * Map of singletons, keyed by the packed version number
     */
    private static final HashMap<Integer, VersionInfo> MAP_ = new HashMap<>();
    /**
     * Error statement string
     */
    private static final String INVALID_VERSION_NUMBER_ =
        "Invalid version number: Version number may be negative or greater than 255";

    // private constructor -----------------------------------------------

    /**
     * Constructor with int
     * @param compactversion a 32 bit int with each byte representing a number
     */
    private VersionInfo(int compactversion)
    {
        m_version_ = compactversion;
    }

    /**
     * Packs the four version numbers into one int, one byte each,
     * major in the most significant byte.
     * @param major non-negative version number
     * @param minor non-negative version number
     * @param milli non-negative version number
     * @param micro non-negative version number
     * @return the packed version number
     */
    private static int getInt(int major, int minor, int milli, int micro)
    {
        return (major << 24) | (minor << 16) | (milli << 8) | micro;
    }
}
|
||||
Reference in New Issue
Block a user