feat(jdk8): move files to new folder to avoid resources compiled.

This commit is contained in:
2025-09-07 15:25:52 +08:00
parent 3f0047bf6f
commit 8c35cfb1c0
17415 changed files with 217 additions and 213 deletions

View File

@@ -0,0 +1,349 @@
/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
/**
* Trie implementation which stores data in char, 16 bits.
* @author synwee
* @see com.ibm.icu.impl.Trie
* @since release 2.1, Jan 01 2002
*/
// note that i need to handle the block calculations later, since chartrie
// in icu4c uses the same index array.
public class CharTrie extends Trie
{
// public constructors ---------------------------------------------
/**
* <p>Creates a new Trie with the settings for the trie data.</p>
* <p>Unserialize the 32-bit-aligned input stream and use the data for the
* trie.</p>
* @param inputStream file input stream to a ICU data file, containing
* the trie
* @param dataManipulate object which provides methods to parse the char
* data
* @throws IOException thrown when data reading fails
* @draft 2.1
*/
public CharTrie(InputStream inputStream,
DataManipulate dataManipulate) throws IOException
{
super(inputStream, dataManipulate);
if (!isCharTrie()) {
throw new IllegalArgumentException(
"Data given does not belong to a char trie.");
}
m_friendAgent_ = new FriendAgent();
}
/**
* Make a dummy CharTrie.
* A dummy trie is an empty runtime trie, used when a real data trie cannot
* be loaded.
*
* The trie always returns the initialValue,
* or the leadUnitValue for lead surrogate code points.
* The Latin-1 part is always set up to be linear.
*
* @param initialValue the initial value that is set for all code points
* @param leadUnitValue the value for lead surrogate code _units_ that do not
* have associated supplementary data
* @param dataManipulate object which provides methods to parse the char data
*/
public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);
int dataLength, latin1Length, i, limit;
char block;
/* calculate the actual size of the dummy trie data */
/* max(Latin-1, block 0) */
dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
if(leadUnitValue!=initialValue) {
dataLength+=DATA_BLOCK_LENGTH;
}
m_data_=new char[dataLength];
m_dataLength_=dataLength;
m_initialValue_=(char)initialValue;
/* fill the index and data arrays */
/* indexes are preset to 0 (block 0) */
/* Latin-1 data */
for(i=0; i<latin1Length; ++i) {
m_data_[i]=(char)initialValue;
}
if(leadUnitValue!=initialValue) {
/* indexes for lead surrogate code units to the block after Latin-1 */
block=(char)(latin1Length>>INDEX_STAGE_2_SHIFT_);
i=0xd800>>INDEX_STAGE_1_SHIFT_;
limit=0xdc00>>INDEX_STAGE_1_SHIFT_;
for(; i<limit; ++i) {
m_index_[i]=block;
}
/* data for lead surrogate code units */
limit=latin1Length+DATA_BLOCK_LENGTH;
for(i=latin1Length; i<limit; ++i) {
m_data_[i]=(char)leadUnitValue;
}
}
m_friendAgent_ = new FriendAgent();
}
/**
* Java friend implementation
*/
public class FriendAgent
{
/**
* Gives out the index array of the trie
* @return index array of trie
*/
public char[] getPrivateIndex()
{
return m_index_;
}
/**
* Gives out the data array of the trie
* @return data array of trie
*/
public char[] getPrivateData()
{
return m_data_;
}
/**
* Gives out the data offset in the trie
* @return data offset in the trie
*/
public int getPrivateInitialValue()
{
return m_initialValue_;
}
}
// public methods --------------------------------------------------
/**
* Java friend implementation
* To store the index and data array into the argument.
* @param friend java friend UCharacterProperty object to store the array
*/
public void putIndexData(UCharacterProperty friend)
{
friend.setIndexData(m_friendAgent_);
}
/**
* Gets the value associated with the codepoint.
* If no value is associated with the codepoint, a default value will be
* returned.
* @param ch codepoint
* @return offset to data
* @draft 2.1
*/
public final char getCodePointValue(int ch)
{
int offset;
// fastpath for U+0000..U+D7FF
if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
// copy of getRawOffset()
offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_)
+ (ch & INDEX_STAGE_3_MASK_);
return m_data_[offset];
}
// handle U+D800..U+10FFFF
offset = getCodePointOffset(ch);
// return -1 if there is an error, in this case we return the default
// value: m_initialValue_
return (offset >= 0) ? m_data_[offset] : m_initialValue_;
}
/**
* Gets the value to the data which this lead surrogate character points
* to.
* Returned data may contain folding offset information for the next
* trailing surrogate character.
* This method does not guarantee correct results for trail surrogates.
* @param ch lead surrogate character
* @return data value
* @draft 2.1
*/
public final char getLeadValue(char ch)
{
return m_data_[getLeadOffset(ch)];
}
/**
* Get the value associated with a pair of surrogates.
* @param lead a lead surrogate
* @param trail a trail surrogate
* @draft 2.1
*/
public final char getSurrogateValue(char lead, char trail)
{
int offset = getSurrogateOffset(lead, trail);
if (offset > 0) {
return m_data_[offset];
}
return m_initialValue_;
}
/**
* <p>Get a value from a folding offset (from the value of a lead surrogate)
* and a trail surrogate.</p>
* <p>If the
* @param leadvalue value associated with the lead surrogate which contains
* the folding offset
* @param trail surrogate
* @return trie data value associated with the trail character
* @draft 2.1
*/
public final char getTrailValue(int leadvalue, char trail)
{
if (m_dataManipulate_ == null) {
throw new NullPointerException(
"The field DataManipulate in this Trie is null");
}
int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
if (offset > 0) {
return m_data_[getRawOffset(offset,
(char)(trail & SURROGATE_MASK_))];
}
return m_initialValue_;
}
// protected methods -----------------------------------------------
/**
* <p>Parses the input stream and stores its trie content into a index and
* data array</p>
* @param inputStream data input stream containing trie data
* @exception IOException thrown when data reading fails
*/
protected final void unserialize(InputStream inputStream)
throws IOException
{
DataInputStream input = new DataInputStream(inputStream);
int indexDataLength = m_dataOffset_ + m_dataLength_;
m_index_ = new char[indexDataLength];
for (int i = 0; i < indexDataLength; i ++) {
m_index_[i] = input.readChar();
}
m_data_ = m_index_;
m_initialValue_ = m_data_[m_dataOffset_];
}
/**
* Gets the offset to the data which the surrogate pair points to.
* @param lead lead surrogate
* @param trail trailing surrogate
* @return offset to data
* @draft 2.1
*/
protected final int getSurrogateOffset(char lead, char trail)
{
if (m_dataManipulate_ == null) {
throw new NullPointerException(
"The field DataManipulate in this Trie is null");
}
// get fold position for the next trail surrogate
int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
// get the real data from the folded lead/trail units
if (offset > 0) {
return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
}
// return -1 if there is an error, in this case we return the default
// value: m_initialValue_
return -1;
}
/**
* Gets the value at the argument index.
* For use internally in TrieIterator.
* @param index value at index will be retrieved
* @return 32 bit value
* @see com.ibm.icu.impl.TrieIterator
* @draft 2.1
*/
protected final int getValue(int index)
{
return m_data_[index];
}
/**
* Gets the default initial value
* @return 32 bit value
* @draft 2.1
*/
protected final int getInitialValue()
{
return m_initialValue_;
}
// private data members --------------------------------------------
/**
* Default value
*/
private char m_initialValue_;
/**
* Array of char data
*/
private char m_data_[];
/**
* Agent for friends
*/
private FriendAgent m_friendAgent_;
}

View File

@@ -0,0 +1,146 @@
/*
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.text.CharacterIterator;
/**
* This class is a wrapper around CharacterIterator and implements the
* UCharacterIterator protocol
* @author ram
*/
public class CharacterIteratorWrapper extends UCharacterIterator {
private CharacterIterator iterator;
public CharacterIteratorWrapper(CharacterIterator iter){
if(iter==null){
throw new IllegalArgumentException();
}
iterator = iter;
}
/**
* @see UCharacterIterator#current()
*/
public int current() {
int c = iterator.current();
if(c==CharacterIterator.DONE){
return DONE;
}
return c;
}
/**
* @see UCharacterIterator#getLength()
*/
public int getLength() {
return (iterator.getEndIndex() - iterator.getBeginIndex());
}
/**
* @see UCharacterIterator#getIndex()
*/
public int getIndex() {
return iterator.getIndex();
}
/**
* @see UCharacterIterator#next()
*/
public int next() {
int i = iterator.current();
iterator.next();
if(i==CharacterIterator.DONE){
return DONE;
}
return i;
}
/**
* @see UCharacterIterator#previous()
*/
public int previous() {
int i = iterator.previous();
if(i==CharacterIterator.DONE){
return DONE;
}
return i;
}
/**
* @see UCharacterIterator#setIndex(int)
*/
public void setIndex(int index) {
iterator.setIndex(index);
}
//// for StringPrep
/**
* @see UCharacterIterator#getText(char[])
*/
public int getText(char[] fillIn, int offset){
int length =iterator.getEndIndex() - iterator.getBeginIndex();
int currentIndex = iterator.getIndex();
if(offset < 0 || offset + length > fillIn.length){
throw new IndexOutOfBoundsException(Integer.toString(length));
}
for (char ch = iterator.first(); ch != CharacterIterator.DONE; ch = iterator.next()) {
fillIn[offset++] = ch;
}
iterator.setIndex(currentIndex);
return length;
}
/**
* Creates a clone of this iterator. Clones the underlying character iterator.
* @see UCharacterIterator#clone()
*/
public Object clone(){
try {
CharacterIteratorWrapper result = (CharacterIteratorWrapper) super.clone();
result.iterator = (CharacterIterator)this.iterator.clone();
return result;
} catch (CloneNotSupportedException e) {
return null; // only invoked if bad underlying character iterator
}
}
}

View File

@@ -0,0 +1,189 @@
/*
* Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.Arrays;
public final class ICUBinary
{
// public inner interface ------------------------------------------------
/**
* Special interface for data authentication
*/
public static interface Authenticate
{
/**
* Method used in ICUBinary.readHeader() to provide data format
* authentication.
* @param version version of the current data
* @return true if dataformat is an acceptable version, false otherwise
*/
public boolean isDataVersionAcceptable(byte version[]);
}
// public methods --------------------------------------------------------
/**
* <p>ICU data header reader method.
* Takes a ICU generated big-endian input stream, parse the ICU standard
* file header and authenticates them.</p>
* <p>Header format:
* <ul>
* <li> Header size (char)
* <li> Magic number 1 (byte)
* <li> Magic number 2 (byte)
* <li> Rest of the header size (char)
* <li> Reserved word (char)
* <li> Big endian indicator (byte)
* <li> Character set family indicator (byte)
* <li> Size of a char (byte) for c++ and c use
* <li> Reserved byte (byte)
* <li> Data format identifier (4 bytes), each ICU data has its own
* identifier to distinguish them. [0] major [1] minor
* [2] milli [3] micro
* <li> Data version (4 bytes), the change version of the ICU data
* [0] major [1] minor [2] milli [3] micro
* <li> Unicode version (4 bytes) this ICU is based on.
* </ul>
* </p>
* <p>
* Example of use:<br>
* <pre>
* try {
* FileInputStream input = new FileInputStream(filename);
* If (Utility.readICUDataHeader(input, dataformat, dataversion,
* unicode) {
* System.out.println("Verified file header, this is a ICU data file");
* }
* } catch (IOException e) {
* System.out.println("This is not a ICU data file");
* }
* </pre>
* </p>
* @param inputStream input stream that contains the ICU data header
* @param dataFormatIDExpected Data format expected. An array of 4 bytes
* information about the data format.
* E.g. data format ID 1.2.3.4. will became an array of
* {1, 2, 3, 4}
* @param authenticate user defined extra data authentication. This value
* can be null, if no extra authentication is needed.
* @exception IOException thrown if there is a read error or
* when header authentication fails.
* @draft 2.1
*/
public static final byte[] readHeader(InputStream inputStream,
byte dataFormatIDExpected[],
Authenticate authenticate)
throws IOException
{
DataInputStream input = new DataInputStream(inputStream);
char headersize = input.readChar();
int readcount = 2;
//reading the header format
byte magic1 = input.readByte();
readcount ++;
byte magic2 = input.readByte();
readcount ++;
if (magic1 != MAGIC1 || magic2 != MAGIC2) {
throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
}
input.readChar(); // reading size
readcount += 2;
input.readChar(); // reading reserved word
readcount += 2;
byte bigendian = input.readByte();
readcount ++;
byte charset = input.readByte();
readcount ++;
byte charsize = input.readByte();
readcount ++;
input.readByte(); // reading reserved byte
readcount ++;
byte dataFormatID[] = new byte[4];
input.readFully(dataFormatID);
readcount += 4;
byte dataVersion[] = new byte[4];
input.readFully(dataVersion);
readcount += 4;
byte unicodeVersion[] = new byte[4];
input.readFully(unicodeVersion);
readcount += 4;
if (headersize < readcount) {
throw new IOException("Internal Error: Header size error");
}
input.skipBytes(headersize - readcount);
if (bigendian != BIG_ENDIAN_ || charset != CHAR_SET_
|| charsize != CHAR_SIZE_
|| !Arrays.equals(dataFormatIDExpected, dataFormatID)
|| (authenticate != null
&& !authenticate.isDataVersionAcceptable(dataVersion))) {
throw new IOException(HEADER_AUTHENTICATION_FAILED_);
}
return unicodeVersion;
}
// private variables -------------------------------------------------
/**
* Magic numbers to authenticate the data file
*/
private static final byte MAGIC1 = (byte)0xda;
private static final byte MAGIC2 = (byte)0x27;
/**
* File format authentication values
*/
private static final byte BIG_ENDIAN_ = 1;
private static final byte CHAR_SET_ = 0;
private static final byte CHAR_SIZE_ = 2;
/**
* Error messages
*/
private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ =
"ICU data file error: Not an ICU data file";
private static final String HEADER_AUTHENTICATION_FAILED_ =
"ICU data file error: Header authentication failed, please check if you have a valid ICU data file";
}

View File

@@ -0,0 +1,83 @@
/*
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.InputStream;
import java.net.URL;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.MissingResourceException;
/**
* Provides access to ICU data files as InputStreams. Implements security checking.
*/
public final class ICUData {
private static InputStream getStream(final Class<ICUData> root, final String resourceName, boolean required) {
InputStream i = null;
if (System.getSecurityManager() != null) {
i = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
public InputStream run() {
return root.getResourceAsStream(resourceName);
}
});
} else {
i = root.getResourceAsStream(resourceName);
}
if (i == null && required) {
throw new MissingResourceException("could not locate data", root.getPackage().getName(), resourceName);
}
return i;
}
/*
* Convenience override that calls getStream(ICUData.class, resourceName, false);
*/
public static InputStream getStream(String resourceName) {
return getStream(ICUData.class, resourceName, false);
}
/*
* Convenience method that calls getStream(ICUData.class, resourceName, true).
*/
public static InputStream getRequiredStream(String resourceName) {
return getStream(ICUData.class, resourceName, true);
}
}

View File

@@ -0,0 +1,229 @@
/*
* Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.Arrays;
/**
* Trie implementation which stores data in int, 32 bits.
* @author synwee
* @see com.ibm.icu.impl.Trie
* @since release 2.1, Jan 01 2002
*/
public class IntTrie extends Trie
{
// public constructors ---------------------------------------------
/**
* <p>Creates a new Trie with the settings for the trie data.</p>
* <p>Unserialize the 32-bit-aligned input stream and use the data for the
* trie.</p>
* @param inputStream file input stream to a ICU data file, containing
* the trie
* @param dataManipulate object which provides methods to parse the char
* data
* @throws IOException thrown when data reading fails
* @draft 2.1
*/
public IntTrie(InputStream inputStream, DataManipulate datamanipulate)
throws IOException
{
super(inputStream, datamanipulate);
if (!isIntTrie()) {
throw new IllegalArgumentException(
"Data given does not belong to a int trie.");
}
}
// public methods --------------------------------------------------
/**
* Gets the value associated with the codepoint.
* If no value is associated with the codepoint, a default value will be
* returned.
* @param ch codepoint
* @return offset to data
* @draft 2.1
*/
public final int getCodePointValue(int ch)
{
int offset = getCodePointOffset(ch);
return (offset >= 0) ? m_data_[offset] : m_initialValue_;
}
/**
* Gets the value to the data which this lead surrogate character points
* to.
* Returned data may contain folding offset information for the next
* trailing surrogate character.
* This method does not guarantee correct results for trail surrogates.
* @param ch lead surrogate character
* @return data value
* @draft 2.1
*/
public final int getLeadValue(char ch)
{
return m_data_[getLeadOffset(ch)];
}
/**
* Get a value from a folding offset (from the value of a lead surrogate)
* and a trail surrogate.
* @param leadvalue the value of a lead surrogate that contains the
* folding offset
* @param trail surrogate
* @return trie data value associated with the trail character
* @draft 2.1
*/
public final int getTrailValue(int leadvalue, char trail)
{
if (m_dataManipulate_ == null) {
throw new NullPointerException(
"The field DataManipulate in this Trie is null");
}
int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
if (offset > 0) {
return m_data_[getRawOffset(offset,
(char)(trail & SURROGATE_MASK_))];
}
return m_initialValue_;
}
// protected methods -----------------------------------------------
/**
* <p>Parses the input stream and stores its trie content into a index and
* data array</p>
* @param inputStream data input stream containing trie data
* @exception IOException thrown when data reading fails
*/
protected final void unserialize(InputStream inputStream)
throws IOException
{
super.unserialize(inputStream);
// one used for initial value
m_data_ = new int[m_dataLength_];
DataInputStream input = new DataInputStream(inputStream);
for (int i = 0; i < m_dataLength_; i ++) {
m_data_[i] = input.readInt();
}
m_initialValue_ = m_data_[0];
}
/**
* Gets the offset to the data which the surrogate pair points to.
* @param lead lead surrogate
* @param trail trailing surrogate
* @return offset to data
* @draft 2.1
*/
protected final int getSurrogateOffset(char lead, char trail)
{
if (m_dataManipulate_ == null) {
throw new NullPointerException(
"The field DataManipulate in this Trie is null");
}
// get fold position for the next trail surrogate
int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
// get the real data from the folded lead/trail units
if (offset > 0) {
return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
}
// return -1 if there is an error, in this case we return the default
// value: m_initialValue_
return -1;
}
/**
* Gets the value at the argument index.
* For use internally in TrieIterator
* @param index value at index will be retrieved
* @return 32 bit value
* @see com.ibm.icu.impl.TrieIterator
* @draft 2.1
*/
protected final int getValue(int index)
{
return m_data_[index];
}
/**
* Gets the default initial value
* @return 32 bit value
* @draft 2.1
*/
protected final int getInitialValue()
{
return m_initialValue_;
}
// package private methods -----------------------------------------
/**
* Internal constructor for builder use
* @param index the index array to be slotted into this trie
* @param data the data array to be slotted into this trie
* @param initialvalue the initial value for this trie
* @param options trie options to use
* @param datamanipulate folding implementation
*/
IntTrie(char index[], int data[], int initialvalue, int options,
DataManipulate datamanipulate)
{
super(index, options, datamanipulate);
m_data_ = data;
m_dataLength_ = m_data_.length;
m_initialValue_ = initialvalue;
}
// private data members --------------------------------------------
/**
* Default value
*/
private int m_initialValue_;
/**
* Array of char data
*/
private int m_data_[];
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,389 @@
/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
/**
* @author Ram Viswanadha
*/
/*
* Description of the format of unorm.icu version 2.1.
*
* Main change from version 1 to version 2:
* Use of new, common Trie instead of normalization-specific tries.
* Change to version 2.1: add third/auxiliary trie with associated data.
*
* For more details of how to use the data structures see the code
* in unorm.cpp (runtime normalization code) and
* in gennorm.c and gennorm/store.c (build-time data generation).
*
* For the serialized format of Trie see Trie.c/TrieHeader.
*
* - Overall partition
*
* unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
* After that there are the following structures:
*
* char indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file
*
* Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE]
*
* char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
* extraData[0] contains the number of units for
* FC_NFKC_Closure (formatVersion>=2.1)
*
* char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
* combiningTableTop may include one 16-bit padding unit
* to make sure that fcdTrie is 32-bit-aligned
*
* Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
*
* Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
*
*
* The indexes array contains lengths and sizes of the following arrays and structures
* as well as the following values:
* indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
* -- one more than the highest combining index computed for forward-only-combining characters
* indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
* -- number of combining indexes computed for both-ways-combining characters
* indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
* -- number of combining indexes computed for backward-only-combining characters
*
* indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
* -- first code point with a quick check NF* value of NO/MAYBE
*
*
* - Tries
*
* The main structures are two Trie tables ("compact arrays"),
* each with one index array and one data array.
* See Trie.h and Trie.c.
*
*
* - Tries in unorm.icu
*
* The first trie (normTrie above)
* provides data for the NF* quick checks and normalization.
* The second trie (fcdTrie above) provides data just for FCD checks.
*
*
* - norm32 data words from the first trie
*
* The norm32Table contains one 32-bit word "norm32" per code point.
* It contains the following bit fields:
* 31..16 extra data index, EXTRA_SHIFT is used to shift this field down
* if this index is <EXTRA_INDEX_TOP then it is an index into
* extraData[] where variable-length normalization data for this
* code point is found
* if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
* then this is a norm32 for a leading surrogate, and the index
* value is used together with the following trailing surrogate
* code unit in the second trie access
* if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
* then this is a norm32 for a "special" character,
* i.e., the character is a Hangul syllable or a Jamo
* see EXTRA_HANGUL etc.
* generally, instead of extracting this index from the norm32 and
* comparing it with the above constants,
* the normalization code compares the entire norm32 value
* with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
*
* 15..8 combining class (cc) according to UnicodeData.txt
*
* 7..6 COMBINES_ANY flags, used in composition to see if a character
* combines with any following or preceding character(s)
* at all
* 7 COMBINES_BACK
* 6 COMBINES_FWD
*
* 5..0 quick check flags, set for "no" or "maybe", with separate flags for
* each normalization form
* the higher bits are "maybe" flags; for NF*D there are no such flags
* the lower bits are "no" flags for all forms, in the same order
* as the "maybe" flags,
* which is (MSB to LSB): NFKD NFD NFKC NFC
* 5..4 QC_ANY_MAYBE
* 3..0 QC_ANY_NO
* see further related constants
*
*
* - Extra data per code point
*
* "Extra data" is referenced by the index in norm32.
* It is variable-length data. It is only present, and only those parts
* of it are, as needed for a given character.
* The norm32 extra data index is added to the beginning of extraData[]
* to get to a vector of 16-bit words with data at the following offsets:
*
* [-1] Combining index for composition.
* Stored only if norm32&COMBINES_ANY .
* [0] Lengths of the canonical and compatibility decomposition strings.
* Stored only if there are decompositions, i.e.,
* if norm32&(QC_NFD|QC_NFKD)
* High byte: length of NFKD, or 0 if none
* Low byte: length of NFD, or 0 if none
* Each length byte also has another flag:
* Bit 7 of a length byte is set if there are non-zero
* combining classes (cc's) associated with the respective
* decomposition. If this flag is set, then the decomposition
* is preceded by a 16-bit word that contains the
* leading and trailing cc's.
* Bits 6..0 of a length byte are the length of the
* decomposition string, not counting the cc word.
* [1..n] NFD
* [n+1..] NFKD
*
* Each of the two decompositions consists of up to two parts:
* - The 16-bit words with the leading and trailing cc's.
* This is only stored if bit 7 of the corresponding length byte
* is set. In this case, at least one of the cc's is not zero.
* High byte: leading cc==cc of the first code point in the decomposition string
* Low byte: trailing cc==cc of the last code point in the decomposition string
* - The decomposition string in UTF-16, with length code units.
*
*
* - Combining indexes and combiningTable[]
*
* Combining indexes are stored at the [-1] offset of the extra data
* if the character combines forward or backward with any other characters.
* They are used for (re)composition in NF*C.
* Values of combining indexes are arranged according to whether a character
* combines forward, backward, or both ways:
* forward-only < both ways < backward-only
*
* The index values for forward-only and both-ways combining characters
* are indexes into the combiningTable[].
* The index values for backward-only combining characters are simply
* incremented from the preceding index values to be unique.
*
* In the combiningTable[], a variable-length list
* of variable-length (back-index, code point) pair entries is stored
* for each forward-combining character.
*
* These back-indexes are the combining indexes of both-ways or backward-only
* combining characters that the forward-combining character combines with.
*
* Each list is sorted in ascending order of back-indexes.
* Each list is terminated with the last back-index having bit 15 set.
*
* Each pair (back-index, code point) takes up either 2 or 3
* 16-bit words.
* The first word of a list entry is the back-index, with its bit 15 set if
* this is the last pair in the list.
*
* The second word contains flags in bits 15..13 that determine
* if there is a third word and how the combined character is encoded:
* 15 set if there is a third word in this list entry
* 14 set if the result is a supplementary character
* 13 set if the result itself combines forward
*
* According to these bits 15..14 of the second word,
* the result character is encoded as follows:
* 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
* the second word.
* 10 The result is 0x2000..0xffff and stored in the third word.
* Bits 12..0 of the second word are not used.
* 11 The result is a supplementary character.
* Bits 9..0 of the leading surrogate are in bits 9..0 of
* the second word.
* Add 0xd800 to these bits to get the complete surrogate.
* Bits 12..10 of the second word are not used.
* The trailing surrogate is stored in the third word.
*
*
* - FCD trie
*
* The FCD trie is very simple.
* It is a folded trie with 16-bit data words.
* In each word, the high byte contains the leading cc of the character,
* and the low byte contains the trailing cc of the character.
* These cc's are the cc's of the first and last code points in the
* canonical decomposition of the character.
*
* Since all 16 bits are used for cc's, lead surrogates must be tested
* by checking the code unit instead of the trie data.
* This is done only if the 16-bit data word is not zero.
* If the code unit is a leading surrogate and the data word is not zero,
* then instead of cc's it contains the offset for the second trie lookup.
*
*
* - Auxiliary trie and data
*
*
* The auxiliary 16-bit trie contains data for additional properties.
* Bits
* 15..13 reserved
* 12 not NFC_Skippable (f) (formatVersion>=2.2)
* 11 flag: not a safe starter for canonical closure
* 10 composition exclusion
* 9.. 0 index into extraData[] to FC_NFKC_Closure string
* (not for lead surrogate),
* or lead surrogate offset (for lead surrogate, if 9..0 not zero)
*
* Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
* (used in NormalizerTransliterator)
*
* A skippable character is
* a) unassigned, or ALL of the following:
* b) of combining class 0.
* c) not decomposed by this normalization form.
* AND if NFC or NFKC,
* d) can never compose with a previous character.
* e) can never compose with a following character.
* f) can never change if another character is added.
* Example: a-breve might satisfy all but f, but if you
* add an ogonek it changes to a-ogonek + breve
*
* a)..e) must be tested from norm32.
* Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
* into the auxiliary trie.
* The same bit is used for NFC and NFKC; (c) differs for them.
* As usual, we build the "not skippable" flags so that unassigned
* code points get a 0 bit.
* This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
* Test Hangul LV syllables entirely in code.
*
*
* - FC_NFKC_Closure strings in extraData[]
*
* Strings are either stored as a single code unit or as the length
* followed by that many units.
*
*/
final class NormalizerDataReader implements ICUBinary.Authenticate {
/**
* <p>Protected constructor.</p>
* @param inputStream ICU uprop.dat file input stream
* @exception IOException throw if data file fails authentication
* @draft 2.1
*/
protected NormalizerDataReader(InputStream inputStream)
throws IOException{
unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
dataInputStream = new DataInputStream(inputStream);
}
// protected methods -------------------------------------------------
protected int[] readIndexes(int length)throws IOException{
int[] indexes = new int[length];
//Read the indexes
for (int i = 0; i <length ; i++) {
indexes[i] = dataInputStream.readInt();
}
return indexes;
}
/**
* <p>Reads unorm.icu, parse it into blocks of data to be stored in
* NormalizerImpl.</P
* @param normBytes
* @param fcdBytes
* @param auxBytes
* @param extraData
* @param combiningTable
* @exception thrown when data reading fails
* @draft 2.1
*/
protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
char[] extraData, char[] combiningTable)
throws IOException{
//Read the bytes that make up the normTrie
dataInputStream.readFully(normBytes);
//normTrieStream= new ByteArrayInputStream(normBytes);
//Read the extra data
for(int i=0;i<extraData.length;i++){
extraData[i]=dataInputStream.readChar();
}
//Read the combining class table
for(int i=0; i<combiningTable.length; i++){
combiningTable[i]=dataInputStream.readChar();
}
//Read the fcdTrie
dataInputStream.readFully(fcdBytes);
//Read the AuxTrie
dataInputStream.readFully(auxBytes);
}
public byte[] getDataFormatVersion(){
return DATA_FORMAT_VERSION;
}
public boolean isDataVersionAcceptable(byte version[])
{
return version[0] == DATA_FORMAT_VERSION[0]
&& version[2] == DATA_FORMAT_VERSION[2]
&& version[3] == DATA_FORMAT_VERSION[3];
}
public byte[] getUnicodeVersion(){
return unicodeVersion;
}
// private data members -------------------------------------------------
/**
* ICU data file input stream
*/
private DataInputStream dataInputStream;
private byte[] unicodeVersion;
/**
* File format version that this class understands.
* No guarantees are made if a older version is used
* see store.c of gennorm for more information and values
*/
private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
(byte)0x72, (byte)0x6D};
private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
(byte)0x5, (byte)0x2};
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,140 @@
/*
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <p>Interface for enabling iteration over sets of <int index, int value>,
* where index is the sorted integer index in ascending order and value, its
* associated integer value.</p>
* <p>The result for each iteration is the consecutive range of
* <int index, int value> with the same value. Result is represented by
* <start, limit, value> where</p>
* <ul>
* <li> start is the starting integer of the result range
* <li> limit is 1 after the maximum integer that follows start, such that
* all integers between start and (limit - 1), inclusive, have the same
* associated integer value.
* <li> value is the integer value that all integers from start to (limit - 1)
* share in common.
* </ul>
* <p>
* Hence value(start) = value(start + 1) = .... = value(start + n) = .... =
* value(limit - 1). However value(start -1) != value(start) and
* value(limit) != value(start).
* </p>
* <p>Most implementations will be created by factory methods, such as the
* character type iterator in UCharacter.getTypeIterator. See example below.
* </p>
* Example of use:<br>
* <pre>
* RangeValueIterator iterator = UCharacter.getTypeIterator();
* RangeValueIterator.Element result = new RangeValueIterator.Element();
* while (iterator.next(result)) {
* System.out.println("Codepoint \\u" +
* Integer.toHexString(result.start) +
* " to codepoint \\u" +
* Integer.toHexString(result.limit - 1) +
* " has the character type " + result.value);
* }
* </pre>
* @author synwee
* @stable ICU 2.6
*/
public interface RangeValueIterator
{
// public inner class ---------------------------------------------
/**
* Return result wrapper for com.ibm.icu.util.RangeValueIterator.
* Stores the start and limit of the continous result range and the
* common value all integers between [start, limit - 1] has.
* @stable ICU 2.6
*/
public class Element
{
// public data member ---------------------------------------------
/**
* Starting integer of the continuous result range that has the same
* value
* @stable ICU 2.6
*/
public int start;
/**
* (End + 1) integer of continuous result range that has the same
* value
* @stable ICU 2.6
*/
public int limit;
/**
* Gets the common value of the continous result range
* @stable ICU 2.6
*/
public int value;
// public constructor --------------------------------------------
/**
* Empty default constructor to make javadoc happy
* @stable ICU 2.4
*/
public Element()
{
}
}
// public methods -------------------------------------------------
/**
* <p>Gets the next maximal result range with a common value and returns
* true if we are not at the end of the iteration, false otherwise.</p>
* <p>If the return boolean is a false, the contents of elements will not
* be updated.</p>
* @param element for storing the result range and value
* @return true if we are not at the end of the iteration, false otherwise.
* @see Element
* @stable ICU 2.6
*/
public boolean next(Element element);
/**
* Resets the iterator to the beginning of the iteration.
* @stable ICU 2.6
*/
public void reset();
}

View File

@@ -0,0 +1,123 @@
/*
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <code>Replaceable</code> is an interface representing a
* string of characters that supports the replacement of a range of
* itself with a new string of characters. It is used by APIs that
* change a piece of text while retaining metadata. Metadata is data
* other than the Unicode characters returned by char32At(). One
* example of metadata is style attributes; another is an edit
* history, marking each character with an author and revision number.
*
* <p>An implicit aspect of the <code>Replaceable</code> API is that
* during a replace operation, new characters take on the metadata of
* the old characters. For example, if the string "the <b>bold</b>
* font" has range (4, 8) replaced with "strong", then it becomes "the
* <b>strong</b> font".
*
* <p><code>Replaceable</code> specifies ranges using a start
* offset and a limit offset. The range of characters thus specified
* includes the characters at offset start..limit-1. That is, the
* start offset is inclusive, and the limit offset is exclusive.
*
* <p><code>Replaceable</code> also includes API to access characters
* in the string: <code>length()</code>, <code>charAt()</code>,
* <code>char32At()</code>, and <code>extractBetween()</code>.
*
* <p>For a subclass to support metadata, typical behavior of
* <code>replace()</code> is the following:
* <ul>
* <li>Set the metadata of the new text to the metadata of the first
* character replaced</li>
* <li>If no characters are replaced, use the metadata of the
* previous character</li>
* <li>If there is no previous character (i.e. start == 0), use the
* following character</li>
* <li>If there is no following character (i.e. the replaceable was
* empty), use default metadata<br>
* <li>If the code point U+FFFF is seen, it should be interpreted as
* a special marker having no metadata<li>
* </li>
* </ul>
* If this is not the behavior, the subclass should document any differences.
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @stable ICU 2.0
*/
public interface Replaceable {
/**
* Returns the number of 16-bit code units in the text.
* @return number of 16-bit code units in text
* @stable ICU 2.0
*/
int length();
/**
* Returns the 16-bit code unit at the given offset into the text.
* @param offset an integer between 0 and <code>length()</code>-1
* inclusive
* @return 16-bit code unit of text at given offset
* @stable ICU 2.0
*/
char charAt(int offset);
//// for StringPrep
/**
* Copies characters from this object into the destination
* character array. The first character to be copied is at index
* <code>srcStart</code>; the last character to be copied is at
* index <code>srcLimit-1</code> (thus the total number of
* characters to be copied is <code>srcLimit-srcStart</code>). The
* characters are copied into the subarray of <code>dst</code>
* starting at index <code>dstStart</code> and ending at index
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
*
* @param srcStart the beginning index to copy, inclusive; <code>0
* <= start <= limit</code>.
* @param srcLimit the ending index to copy, exclusive;
* <code>start <= limit <= length()</code>.
* @param dst the destination array.
* @param dstStart the start offset in the destination array.
* @stable ICU 2.0
*/
void getChars(int srcStart, int srcLimit, char dst[], int dstStart);
}

View File

@@ -0,0 +1,123 @@
/*
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <code>ReplaceableString</code> is an adapter class that implements the
* <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
*
* <p><em>Note:</em> This class does not support attributes and is not
* intended for general use. Most clients will need to implement
* {@link Replaceable} in their text representation class.
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @see Replaceable
* @author Alan Liu
* @stable ICU 2.0
*/
public class ReplaceableString implements Replaceable {
private StringBuffer buf;
/**
* Construct a new object with the given initial contents.
* @param str initial contents
* @stable ICU 2.0
*/
public ReplaceableString(String str) {
buf = new StringBuffer(str);
}
//// for StringPrep
/**
* Construct a new object using <code>buf</code> for internal
* storage. The contents of <code>buf</code> at the time of
* construction are used as the initial contents. <em>Note!
* Modifications to <code>buf</code> will modify this object, and
* vice versa.</em>
* @param buf object to be used as internal storage
* @stable ICU 2.0
*/
public ReplaceableString(StringBuffer buf) {
this.buf = buf;
}
/**
* Return the number of characters contained in this object.
* <code>Replaceable</code> API.
* @stable ICU 2.0
*/
public int length() {
return buf.length();
}
/**
* Return the character at the given position in this object.
* <code>Replaceable</code> API.
* @param offset offset into the contents, from 0 to
* <code>length()</code> - 1
* @stable ICU 2.0
*/
public char charAt(int offset) {
return buf.charAt(offset);
}
//// for StringPrep
/**
* Copies characters from this object into the destination
* character array. The first character to be copied is at index
* <code>srcStart</code>; the last character to be copied is at
* index <code>srcLimit-1</code> (thus the total number of
* characters to be copied is <code>srcLimit-srcStart</code>). The
* characters are copied into the subarray of <code>dst</code>
* starting at index <code>dstStart</code> and ending at index
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
*
* @param srcStart the beginning index to copy, inclusive; <code>0
* <= start <= limit</code>.
* @param srcLimit the ending index to copy, exclusive;
* <code>start <= limit <= length()</code>.
* @param dst the destination array.
* @param dstStart the start offset in the destination array.
* @stable ICU 2.0
*/
public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
Utility.getChars(buf, srcStart, srcLimit, dst, dstStart);
}
}

View File

@@ -0,0 +1,190 @@
/*
* Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* DLF docs must define behavior when Replaceable is mutated underneath
* the iterator.
*
* This and ICUCharacterIterator share some code, maybe they should share
* an implementation, or the common state and implementation should be
* moved up into UCharacterIterator.
*
* What are first, last, and getBeginIndex doing here?!?!?!
*/
public class ReplaceableUCharacterIterator extends UCharacterIterator {
// public constructor ------------------------------------------------------
/**
* Public constructor
* @param str text which the iterator will be based on
*/
public ReplaceableUCharacterIterator(String str){
if(str==null){
throw new IllegalArgumentException();
}
this.replaceable = new ReplaceableString(str);
this.currentIndex = 0;
}
//// for StringPrep
/**
* Public constructor
* @param buf buffer of text on which the iterator will be based
*/
public ReplaceableUCharacterIterator(StringBuffer buf){
if(buf==null){
throw new IllegalArgumentException();
}
this.replaceable = new ReplaceableString(buf);
this.currentIndex = 0;
}
// public methods ----------------------------------------------------------
/**
* Creates a copy of this iterator, does not clone the underlying
* <code>Replaceable</code>object
* @return copy of this iterator
*/
public Object clone(){
try {
return super.clone();
} catch (CloneNotSupportedException e) {
return null; // never invoked
}
}
/**
* Returns the current UTF16 character.
* @return current UTF16 character
*/
public int current(){
if (currentIndex < replaceable.length()) {
return replaceable.charAt(currentIndex);
}
return DONE;
}
/**
* Returns the length of the text
* @return length of the text
*/
public int getLength(){
return replaceable.length();
}
/**
* Gets the current currentIndex in text.
* @return current currentIndex in text.
*/
public int getIndex(){
return currentIndex;
}
/**
* Returns next UTF16 character and increments the iterator's currentIndex by 1.
* If the resulting currentIndex is greater or equal to the text length, the
* currentIndex is reset to the text length and a value of DONECODEPOINT is
* returned.
* @return next UTF16 character in text or DONE if the new currentIndex is off the
* end of the text range.
*/
public int next(){
if (currentIndex < replaceable.length()) {
return replaceable.charAt(currentIndex++);
}
return DONE;
}
/**
* Returns previous UTF16 character and decrements the iterator's currentIndex by
* 1.
* If the resulting currentIndex is less than 0, the currentIndex is reset to 0 and a
* value of DONECODEPOINT is returned.
* @return next UTF16 character in text or DONE if the new currentIndex is off the
* start of the text range.
*/
public int previous(){
if (currentIndex > 0) {
return replaceable.charAt(--currentIndex);
}
return DONE;
}
/**
* <p>Sets the currentIndex to the specified currentIndex in the text and returns that
* single UTF16 character at currentIndex.
* This assumes the text is stored as 16-bit code units.</p>
* @param currentIndex the currentIndex within the text.
* @exception IllegalArgumentException is thrown if an invalid currentIndex is
* supplied. i.e. currentIndex is out of bounds.
* @return the character at the specified currentIndex or DONE if the specified
* currentIndex is equal to the end of the text.
*/
public void setIndex(int currentIndex) {
if (currentIndex < 0 || currentIndex > replaceable.length()) {
throw new IllegalArgumentException();
}
this.currentIndex = currentIndex;
}
//// for StringPrep
public int getText(char[] fillIn, int offset){
int length = replaceable.length();
if(offset < 0 || offset + length > fillIn.length){
throw new IndexOutOfBoundsException(Integer.toString(length));
}
replaceable.getChars(0,length,fillIn,offset);
return length;
}
// private data members ----------------------------------------------------
/**
* Replaceable object
*/
private Replaceable replaceable;
/**
* Current currentIndex
*/
private int currentIndex;
}

View File

@@ -0,0 +1,367 @@
/*
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
/*
**********************************************************************
* Author: Alan Liu
* Created: September 23 2003
* Since: ICU 2.8
**********************************************************************
*/
package sun.text.normalizer;
import java.text.ParsePosition;
/**
* An iterator that returns 32-bit code points. This class is deliberately
* <em>not</em> related to any of the JDK or ICU4J character iterator classes
* in order to minimize complexity.
* @author Alan Liu
* @since ICU 2.8
*/
public class RuleCharacterIterator {
// TODO: Ideas for later. (Do not implement if not needed, lest the
// code coverage numbers go down due to unused methods.)
// 1. Add a copy constructor, equals() method, clone() method.
// 2. Rather than return DONE, throw an exception if the end
// is reached -- this is an alternate usage model, probably not useful.
// 3. Return isEscaped from next(). If this happens,
// don't keep an isEscaped member variable.
/**
* Text being iterated.
*/
private String text;
/**
* Position of iterator.
*/
private ParsePosition pos;
/**
* Symbol table used to parse and dereference variables. May be null.
*/
private SymbolTable sym;
/**
* Current variable expansion, or null if none.
*/
private char[] buf;
/**
* Position within buf[]. Meaningless if buf == null.
*/
private int bufPos;
/**
* Flag indicating whether the last character was parsed from an escape.
*/
private boolean isEscaped;
/**
* Value returned when there are no more characters to iterate.
*/
public static final int DONE = -1;
/**
* Bitmask option to enable parsing of variable names. If (options &
* PARSE_VARIABLES) != 0, then an embedded variable will be expanded to
* its value. Variables are parsed using the SymbolTable API.
*/
public static final int PARSE_VARIABLES = 1;
/**
* Bitmask option to enable parsing of escape sequences. If (options &
* PARSE_ESCAPES) != 0, then an embedded escape sequence will be expanded
* to its value. Escapes are parsed using Utility.unescapeAt().
*/
public static final int PARSE_ESCAPES = 2;
/**
* Bitmask option to enable skipping of whitespace. If (options &
* SKIP_WHITESPACE) != 0, then whitespace characters will be silently
* skipped, as if they were not present in the input. Whitespace
* characters are defined by UCharacterProperty.isRuleWhiteSpace().
*/
public static final int SKIP_WHITESPACE = 4;
/**
* Constructs an iterator over the given text, starting at the given
* position.
* @param text the text to be iterated
* @param sym the symbol table, or null if there is none. If sym is null,
* then variables will not be deferenced, even if the PARSE_VARIABLES
* option is set.
* @param pos upon input, the index of the next character to return. If a
* variable has been dereferenced, then pos will <em>not</em> increment as
* characters of the variable value are iterated.
*/
public RuleCharacterIterator(String text, SymbolTable sym,
ParsePosition pos) {
if (text == null || pos.getIndex() > text.length()) {
throw new IllegalArgumentException();
}
this.text = text;
this.sym = sym;
this.pos = pos;
buf = null;
}
/**
* Returns true if this iterator has no more characters to return.
*/
public boolean atEnd() {
return buf == null && pos.getIndex() == text.length();
}
/**
* Returns the next character using the given options, or DONE if there
* are no more characters, and advance the position to the next
* character.
* @param options one or more of the following options, bitwise-OR-ed
* together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
* @return the current 32-bit code point, or DONE
*/
public int next(int options) {
int c = DONE;
isEscaped = false;
for (;;) {
c = _current();
_advance(UTF16.getCharCount(c));
if (c == SymbolTable.SYMBOL_REF && buf == null &&
(options & PARSE_VARIABLES) != 0 && sym != null) {
String name = sym.parseReference(text, pos, text.length());
// If name == null there was an isolated SYMBOL_REF;
// return it. Caller must be prepared for this.
if (name == null) {
break;
}
bufPos = 0;
buf = sym.lookup(name);
if (buf == null) {
throw new IllegalArgumentException(
"Undefined variable: " + name);
}
// Handle empty variable value
if (buf.length == 0) {
buf = null;
}
continue;
}
if ((options & SKIP_WHITESPACE) != 0 &&
UCharacterProperty.isRuleWhiteSpace(c)) {
continue;
}
if (c == '\\' && (options & PARSE_ESCAPES) != 0) {
int offset[] = new int[] { 0 };
c = Utility.unescapeAt(lookahead(), offset);
jumpahead(offset[0]);
isEscaped = true;
if (c < 0) {
throw new IllegalArgumentException("Invalid escape");
}
}
break;
}
return c;
}
/**
* Returns true if the last character returned by next() was
* escaped. This will only be the case if the option passed in to
* next() included PARSE_ESCAPED and the next character was an
* escape sequence.
*/
public boolean isEscaped() {
return isEscaped;
}
/**
* Returns true if this iterator is currently within a variable expansion.
*/
public boolean inVariable() {
return buf != null;
}
/**
* Returns an object which, when later passed to setPos(), will
* restore this iterator's position. Usage idiom:
*
* RuleCharacterIterator iterator = ...;
* Object pos = iterator.getPos(null); // allocate position object
* for (;;) {
* pos = iterator.getPos(pos); // reuse position object
* int c = iterator.next(...);
* ...
* }
* iterator.setPos(pos);
*
* @param p a position object previously returned by getPos(),
* or null. If not null, it will be updated and returned. If
* null, a new position object will be allocated and returned.
* @return a position object which may be passed to setPos(),
* either `p,' or if `p' == null, a newly-allocated object
*/
public Object getPos(Object p) {
if (p == null) {
return new Object[] {buf, new int[] {pos.getIndex(), bufPos}};
}
Object[] a = (Object[]) p;
a[0] = buf;
int[] v = (int[]) a[1];
v[0] = pos.getIndex();
v[1] = bufPos;
return p;
}
/**
* Restores this iterator to the position it had when getPos()
* returned the given object.
* @param p a position object previously returned by getPos()
*/
public void setPos(Object p) {
Object[] a = (Object[]) p;
buf = (char[]) a[0];
int[] v = (int[]) a[1];
pos.setIndex(v[0]);
bufPos = v[1];
}
/**
* Skips ahead past any ignored characters, as indicated by the given
* options. This is useful in conjunction with the lookahead() method.
*
* Currently, this only has an effect for SKIP_WHITESPACE.
* @param options one or more of the following options, bitwise-OR-ed
* together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE.
*/
public void skipIgnored(int options) {
if ((options & SKIP_WHITESPACE) != 0) {
for (;;) {
int a = _current();
if (!UCharacterProperty.isRuleWhiteSpace(a)) break;
_advance(UTF16.getCharCount(a));
}
}
}
/**
* Returns a string containing the remainder of the characters to be
* returned by this iterator, without any option processing. If the
* iterator is currently within a variable expansion, this will only
* extend to the end of the variable expansion. This method is provided
* so that iterators may interoperate with string-based APIs. The typical
* sequence of calls is to call skipIgnored(), then call lookahead(), then
* parse the string returned by lookahead(), then call jumpahead() to
* resynchronize the iterator.
* @return a string containing the characters to be returned by future
* calls to next()
*/
public String lookahead() {
if (buf != null) {
return new String(buf, bufPos, buf.length - bufPos);
} else {
return text.substring(pos.getIndex());
}
}
/**
* Advances the position by the given number of 16-bit code units.
* This is useful in conjunction with the lookahead() method.
* @param count the number of 16-bit code units to jump over
*/
public void jumpahead(int count) {
if (count < 0) {
throw new IllegalArgumentException();
}
if (buf != null) {
bufPos += count;
if (bufPos > buf.length) {
throw new IllegalArgumentException();
}
if (bufPos == buf.length) {
buf = null;
}
} else {
int i = pos.getIndex() + count;
pos.setIndex(i);
if (i > text.length()) {
throw new IllegalArgumentException();
}
}
}
/**
* Returns the current 32-bit code point without parsing escapes, parsing
* variables, or skipping whitespace.
* @return the current 32-bit code point
*/
private int _current() {
if (buf != null) {
return UTF16.charAt(buf, 0, buf.length, bufPos);
} else {
int i = pos.getIndex();
return (i < text.length()) ? UTF16.charAt(text, i) : DONE;
}
}
/**
* Advances the position by the given amount.
* @param count the number of 16-bit code units to advance past
*/
private void _advance(int count) {
if (buf != null) {
bufPos += count;
if (bufPos == buf.length) {
buf = null;
}
} else {
pos.setIndex(pos.getIndex() + count);
if (pos.getIndex() > text.length()) {
pos.setIndex(text.length());
}
}
}
}

View File

@@ -0,0 +1,124 @@
/*
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.text.ParsePosition;
/**
* An interface that defines both lookup protocol and parsing of
* symbolic names.
*
* <p>A symbol table maintains two kinds of mappings. The first is
* between symbolic names and their values. For example, if the
* variable with the name "start" is set to the value "alpha"
* (perhaps, though not necessarily, through an expression such as
* "$start=alpha"), then the call lookup("start") will return the
* char[] array ['a', 'l', 'p', 'h', 'a'].
*
* <p>The second kind of mapping is between character values and
* UnicodeMatcher objects. This is used by RuleBasedTransliterator,
* which uses characters in the private use area to represent objects
* such as UnicodeSets. If U+E015 is mapped to the UnicodeSet [a-z],
* then lookupMatcher(0xE015) will return the UnicodeSet [a-z].
*
* <p>Finally, a symbol table defines parsing behavior for symbolic
* names. All symbolic names start with the SYMBOL_REF character.
* When a parser encounters this character, it calls parseReference()
* with the position immediately following the SYMBOL_REF. The symbol
* table parses the name, if there is one, and returns it.
*
* @draft ICU 2.8
* @deprecated This is a draft API and might change in a future release of ICU.
*/
@Deprecated
public interface SymbolTable {
/**
* The character preceding a symbol reference name.
* @draft ICU 2.8
* @deprecated This is a draft API and might change in a future release of ICU.
*/
@Deprecated
static final char SYMBOL_REF = '$';
/**
* Lookup the characters associated with this string and return it.
* Return <tt>null</tt> if no such name exists. The resultant
* array may have length zero.
* @param s the symbolic name to lookup
* @return a char array containing the name's value, or null if
* there is no mapping for s.
* @draft ICU 2.8
* @deprecated This is a draft API and might change in a future release of ICU.
*/
@Deprecated
char[] lookup(String s);
/**
* Lookup the UnicodeMatcher associated with the given character, and
* return it. Return <tt>null</tt> if not found.
* @param ch a 32-bit code point from 0 to 0x10FFFF inclusive.
* @return the UnicodeMatcher object represented by the given
* character, or null if there is no mapping for ch.
* @draft ICU 2.8
* @deprecated This is a draft API and might change in a future release of ICU.
*/
@Deprecated
UnicodeMatcher lookupMatcher(int ch);
/**
* Parse a symbol reference name from the given string, starting
* at the given position. If no valid symbol reference name is
* found, return null and leave pos unchanged. That is, if the
* character at pos cannot start a name, or if pos is at or after
* text.length(), then return null. This indicates an isolated
* SYMBOL_REF character.
* @param text the text to parse for the name
* @param pos on entry, the index of the first character to parse.
* This is the character following the SYMBOL_REF character. On
* exit, the index after the last parsed character. If the parse
* failed, pos is unchanged on exit.
* @param limit the index after the last character to be parsed.
* @return the parsed name, or null if there is no valid symbolic
* name at the given position.
* @draft ICU 2.8
* @deprecated This is a draft API and might change in a future release of ICU.
*/
@Deprecated
String parseReference(String text, ParsePosition pos, int limit);
}

View File

@@ -0,0 +1,419 @@
/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
/**
* <p>A trie is a kind of compressed, serializable table of values
* associated with Unicode code points (0..0x10ffff).</p>
* <p>This class defines the basic structure of a trie and provides methods
* to <b>retrieve the offsets to the actual data</b>.</p>
* <p>Data will be the form of an array of basic types, char or int.</p>
* <p>The actual data format will have to be specified by the user in the
* inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
* <p>This trie implementation is optimized for getting offset while walking
* forward through a UTF-16 string.
* Therefore, the simplest and fastest access macros are the
* fromLead() and fromOffsetTrail() methods.
* The fromBMP() method are a little more complicated; they get offsets even
* for lead surrogate codepoints, while the fromLead() method get special
* "folded" offsets for lead surrogate code units if there is relevant data
* associated with them.
* From such a folded offsets, an offset needs to be extracted to supply
* to the fromOffsetTrail() methods.
* To handle such supplementary codepoints, some offset information are kept
* in the data.</p>
* <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
* that offset from the folded value for the lead surrogate unit.</p>
* <p>For examples of use, see com.ibm.icu.impl.CharTrie or
* com.ibm.icu.impl.IntTrie.</p>
* @author synwee
* @see com.ibm.icu.impl.CharTrie
* @see com.ibm.icu.impl.IntTrie
* @since release 2.1, Jan 01 2002
*/
public abstract class Trie
{
// public class declaration ----------------------------------------
/**
* Character data in com.ibm.impl.Trie have different user-specified format
* for different purposes.
* This interface specifies methods to be implemented in order for
* com.ibm.impl.Trie, to surrogate offset information encapsulated within
* the data.
*/
public static interface DataManipulate
{
/**
* Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
* data
* the index array offset of the indexes for that lead surrogate.
* @param value data value for a surrogate from the trie, including the
* folding offset
* @return data offset or 0 if there is no data for the lead surrogate
*/
public int getFoldingOffset(int value);
}
// default implementation
private static class DefaultGetFoldingOffset implements DataManipulate {
public int getFoldingOffset(int value) {
return value;
}
}
// protected constructor -------------------------------------------
/**
* Trie constructor for CharTrie use.
* @param inputStream ICU data file input stream which contains the
* trie
* @param dataManipulate object containing the information to parse the
* trie data
* @throws IOException thrown when input stream does not have the
* right header.
*/
protected Trie(InputStream inputStream,
DataManipulate dataManipulate) throws IOException
{
DataInputStream input = new DataInputStream(inputStream);
// Magic number to authenticate the data.
int signature = input.readInt();
m_options_ = input.readInt();
if (!checkHeader(signature)) {
throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
}
if(dataManipulate != null) {
m_dataManipulate_ = dataManipulate;
} else {
m_dataManipulate_ = new DefaultGetFoldingOffset();
}
m_isLatin1Linear_ = (m_options_ &
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
m_dataOffset_ = input.readInt();
m_dataLength_ = input.readInt();
unserialize(inputStream);
}
/**
* Trie constructor
* @param index array to be used for index
* @param options used by the trie
* @param dataManipulate object containing the information to parse the
* trie data
*/
protected Trie(char index[], int options, DataManipulate dataManipulate)
{
m_options_ = options;
if(dataManipulate != null) {
m_dataManipulate_ = dataManipulate;
} else {
m_dataManipulate_ = new DefaultGetFoldingOffset();
}
m_isLatin1Linear_ = (m_options_ &
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
m_index_ = index;
m_dataOffset_ = m_index_.length;
}
// protected data members ------------------------------------------
/**
* Lead surrogate code points' index displacement in the index array.
* 0x10000-0xd800=0x2800
* 0x2800 >> INDEX_STAGE_1_SHIFT_
*/
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
/**
* Shift size for shifting right the input index. 1..9
*/
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
/**
* Shift size for shifting left the index array values.
* Increases possible data size with 16-bit index values at the cost
* of compactability.
* This requires blocks of stage 2 data to be aligned by
* DATA_GRANULARITY.
* 0..INDEX_STAGE_1_SHIFT
*/
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
/**
* Number of data values in a stage 2 (data array) block.
*/
protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
/**
* Mask for getting the lower bits from the input index.
* DATA_BLOCK_LENGTH - 1.
*/
protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
/** Number of bits of a trail surrogate that are used in index table lookups. */
protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
/**
* Number of index (stage 1) entries per lead surrogate.
* Same as number of index entries for 1024 trail surrogates,
* ==0x400>>INDEX_STAGE_1_SHIFT_
*/
protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
/** Length of the BMP portion of the index (stage 1) array. */
protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
/**
* Surrogate mask to use when shifting offset to retrieve supplementary
* values
*/
protected static final int SURROGATE_MASK_ = 0x3FF;
/**
* Index or UTF16 characters
*/
protected char m_index_[];
/**
* Internal TrieValue which handles the parsing of the data value.
* This class is to be implemented by the user
*/
protected DataManipulate m_dataManipulate_;
/**
* Start index of the data portion of the trie. CharTrie combines
* index and data into a char array, so this is used to indicate the
* initial offset to the data portion.
* Note this index always points to the initial value.
*/
protected int m_dataOffset_;
/**
* Length of the data array
*/
protected int m_dataLength_;
// protected methods -----------------------------------------------
/**
* Gets the offset to the data which the surrogate pair points to.
* @param lead lead surrogate
* @param trail trailing surrogate
* @return offset to data
*/
protected abstract int getSurrogateOffset(char lead, char trail);
/**
* Gets the value at the argument index
* @param index value at index will be retrieved
* @return 32 bit value
*/
protected abstract int getValue(int index);
/**
* Gets the default initial value
* @return 32 bit value
*/
protected abstract int getInitialValue();
/**
* Gets the offset to the data which the index ch after variable offset
* points to.
* Note for locating a non-supplementary character data offset, calling
* <p>
* getRawOffset(0, ch);
* </p>
* will do. Otherwise if it is a supplementary character formed by
* surrogates lead and trail. Then we would have to call getRawOffset()
* with getFoldingIndexOffset(). See getSurrogateOffset().
* @param offset index offset which ch is to start from
* @param ch index to be used after offset
* @return offset to the data
*/
protected final int getRawOffset(int offset, char ch)
{
return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
<< INDEX_STAGE_2_SHIFT_)
+ (ch & INDEX_STAGE_3_MASK_);
}
/**
* Gets the offset to data which the BMP character points to
* Treats a lead surrogate as a normal code point.
* @param ch BMP character
* @return offset to data
*/
protected final int getBMPOffset(char ch)
{
return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
? getRawOffset(LEAD_INDEX_OFFSET_, ch)
: getRawOffset(0, ch);
// using a getRawOffset(ch) makes no diff
}
/**
* Gets the offset to the data which this lead surrogate character points
* to.
* Data at the returned offset may contain folding offset information for
* the next trailing surrogate character.
* @param ch lead surrogate character
* @return offset to data
*/
protected final int getLeadOffset(char ch)
{
return getRawOffset(0, ch);
}
/**
* Internal trie getter from a code point.
* Could be faster(?) but longer with
* if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
* Gets the offset to data which the codepoint points to
* @param ch codepoint
* @return offset to data
*/
protected final int getCodePointOffset(int ch)
{
// if ((ch >> 16) == 0) slower
if (ch < 0) {
return -1;
} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
return getRawOffset(0, (char)ch);
} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
// BMP codepoint
return getBMPOffset((char)ch);
} else if (ch <= UCharacter.MAX_VALUE) {
// look at the construction of supplementary characters
// trail forms the ends of it.
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
(char)(ch & SURROGATE_MASK_));
} else {
// return -1 // if there is an error, in this case we return
return -1;
}
}
/**
* <p>Parses the inputstream and creates the trie index with it.</p>
* <p>This is overwritten by the child classes.
* @param inputStream input stream containing the trie information
* @exception IOException thrown when data reading fails.
*/
protected void unserialize(InputStream inputStream) throws IOException
{
//indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
m_index_ = new char[m_dataOffset_];
DataInputStream input = new DataInputStream(inputStream);
for (int i = 0; i < m_dataOffset_; i ++) {
m_index_[i] = input.readChar();
}
}
/**
* Determines if this is a 32 bit trie
* @return true if options specifies this is a 32 bit trie
*/
protected final boolean isIntTrie()
{
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
}
/**
* Determines if this is a 16 bit trie
* @return true if this is a 16 bit trie
*/
protected final boolean isCharTrie()
{
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
}
// private data members --------------------------------------------
/**
* Latin 1 option mask
*/
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
/**
* Constant number to authenticate the byte block
*/
protected static final int HEADER_SIGNATURE_ = 0x54726965;
/**
* Header option formatting
*/
private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
/**
* Flag indicator for Latin quick access data block
*/
private boolean m_isLatin1Linear_;
/**
* <p>Trie options field.</p>
* <p>options bit field:<br>
* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
* 8 0 = 16-bit data, 1=32-bit data<br>
* 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br>
* 3..0 INDEX_STAGE_2_SHIFT // 1..9<br>
*/
private int m_options_;
// private methods ---------------------------------------------------
/**
* Authenticates raw data header.
* Checking the header information, signature and options.
* @param signature This contains the options and type of a Trie
* @return true if the header is authenticated valid
*/
private final boolean checkHeader(int signature)
{
// check the signature
// Trie in big-endian US-ASCII (0x54726965).
// Magic number to authenticate the data.
if (signature != HEADER_SIGNATURE_) {
return false;
}
if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
INDEX_STAGE_1_SHIFT_ ||
((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
HEADER_OPTIONS_SHIFT_MASK_)
!= INDEX_STAGE_2_SHIFT_) {
return false;
}
return true;
}
}

View File

@@ -0,0 +1,548 @@
/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <p>Class enabling iteration of the values in a Trie.</p>
* <p>Result of each iteration contains the interval of codepoints that have
* the same value type and the value type itself.</p>
* <p>The comparison of each codepoint value is done via extract(), which the
* default implementation is to return the value as it is.</p>
* <p>Method extract() can be overwritten to perform manipulations on
* codepoint values in order to perform specialized comparison.</p>
* <p>TrieIterator is designed to be a generic iterator for the CharTrie
* and the IntTrie, hence to accommodate both types of data, the return
* result will be in terms of int (32 bit) values.</p>
* <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
* <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
* Internally, icu4c's utrie_enum performs all iterations in its body. In Java
* sense, the caller will have to pass a object with a callback function
* UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
* uint32_t value) into utrie_enum. utrie_enum will then find ranges of
* codepoints with the same value as determined by
* UTrieEnumValue(const void *context, uint32_t value). for each range,
* utrie_enum calls the callback function to perform a task. In this way,
* icu4c performs the iteration within utrie_enum.
* To follow the JDK model, icu4j is slightly different from icu4c.
* Instead of requesting the caller to implement an object for a callback.
* The caller will have to implement a subclass of TrieIterator, fleshing out
* the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
* the caller will have to code his own iteration and flesh out the task
* (equivalent to UTrieEnumRange) to be performed in the iteration loop.
* </p>
* <p>There are basically 3 usage scenarios for porting:</p>
* <p>1) UTrieEnumValue is the only implemented callback then just implement a
* subclass of TrieIterator and override the extract(int) method. The
* extract(int) method is analogus to UTrieEnumValue callback.
* </p>
* <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
* a subclass of TrieIterator, override the extract method and iterate, e.g
* </p>
* <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
* set);<br>
* In Java :<br>
* <pre>
* class TrieIteratorImpl extends TrieIterator{
* public TrieIteratorImpl(Trie data){
* super(data);
* }
* public int extract(int value){
* // port the implementation of _enumPropertyStartsValue here
* }
* }
* ....
* TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
* while(fcdIter.next(result)) {
* // port the implementation of _enumPropertyStartsRange
* }
* </pre>
* </p>
* <p>3) UTrieEnumRange is the only implemented callback then just implement
* the while loop, when utrie_enum is called
* <pre>
* // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
* TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
* while(fcdIter.next(result)){
* set.add(result.start);
* }
* </pre>
* </p>
* @author synwee
* @see com.ibm.icu.impl.Trie
* @see com.ibm.icu.lang.UCharacterTypeIterator
* @since release 2.1, Jan 17 2002
*/
public class TrieIterator implements RangeValueIterator
{
// public constructor ---------------------------------------------
/**
* TrieEnumeration constructor
* @param trie to be used
* @exception IllegalArgumentException throw when argument is null.
*/
public TrieIterator(Trie trie)
{
if (trie == null) {
throw new IllegalArgumentException(
"Argument trie cannot be null");
}
m_trie_ = trie;
// synwee: check that extract belongs to the child class
m_initialValue_ = extract(m_trie_.getInitialValue());
reset();
}
// public methods -------------------------------------------------
/**
* <p>Returns true if we are not at the end of the iteration, false
* otherwise.</p>
* <p>The next set of codepoints with the same value type will be
* calculated during this call and returned in the arguement element.</p>
* @param element return result
* @return true if we are not at the end of the iteration, false otherwise.
* @exception NoSuchElementException - if no more elements exist.
* @see com.ibm.icu.util.RangeValueIterator.Element
*/
public final boolean next(Element element)
{
if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
return false;
}
if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
calculateNextBMPElement(element)) {
return true;
}
calculateNextSupplementaryElement(element);
return true;
}
/**
* Resets the iterator to the beginning of the iteration
*/
public final void reset()
{
m_currentCodepoint_ = 0;
m_nextCodepoint_ = 0;
m_nextIndex_ = 0;
m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
if (m_nextBlock_ == 0) {
m_nextValue_ = m_initialValue_;
}
else {
m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
}
m_nextBlockIndex_ = 0;
m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
}
// protected methods ----------------------------------------------
/**
* Called by next() to extracts a 32 bit value from a trie value
* used for comparison.
* This method is to be overwritten if special manipulation is to be done
* to retrieve a relevant comparison.
* The default function is to return the value as it is.
* @param value a value from the trie
* @return extracted value
*/
protected int extract(int value)
{
return value;
}
// private methods ------------------------------------------------
/**
* Set the result values
* @param element return result object
* @param start codepoint of range
* @param limit (end + 1) codepoint of range
* @param value common value of range
*/
private final void setResult(Element element, int start, int limit,
int value)
{
element.start = start;
element.limit = limit;
element.value = value;
}
/**
* Finding the next element.
* This method is called just before returning the result of
* next().
* We always store the next element before it is requested.
* In the case that we have to continue calculations into the
* supplementary planes, a false will be returned.
* @param element return result object
* @return true if the next range is found, false if we have to proceed to
* the supplementary range.
*/
private final boolean calculateNextBMPElement(Element element)
{
int currentBlock = m_nextBlock_;
int currentValue = m_nextValue_;
m_currentCodepoint_ = m_nextCodepoint_;
m_nextCodepoint_ ++;
m_nextBlockIndex_ ++;
if (!checkBlockDetail(currentValue)) {
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
return true;
}
// synwee check that next block index == 0 here
// enumerate BMP - the main loop enumerates data blocks
while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
m_nextIndex_ ++;
// because of the way the character is split to form the index
// the lead surrogate and trail surrogate can not be in the
// mid of a block
if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
// skip lead surrogate code units,
// go to lead surrogate codepoints
m_nextIndex_ = BMP_INDEX_LENGTH_;
}
else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
// go back to regular BMP code points
m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
}
m_nextBlockIndex_ = 0;
if (!checkBlock(currentBlock, currentValue)) {
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
return true;
}
}
m_nextCodepoint_ --; // step one back since this value has not been
m_nextBlockIndex_ --; // retrieved yet.
return false;
}
/**
* Finds the next supplementary element.
* For each entry in the trie, the value to be delivered is passed through
* extract().
* We always store the next element before it is requested.
* Called after calculateNextBMP() completes its round of BMP characters.
* There is a slight difference in the usage of m_currentCodepoint_
* here as compared to calculateNextBMP(). Though both represents the
* lower bound of the next element, in calculateNextBMP() it gets set
* at the start of any loop, where-else, in calculateNextSupplementary()
* since m_currentCodepoint_ already contains the lower bound of the
* next element (passed down from calculateNextBMP()), we keep it till
* the end before resetting it to the new value.
* Note, if there are no more iterations, it will never get to here.
* Blocked out by next().
* @param element return result object
*/
private final void calculateNextSupplementaryElement(Element element)
{
int currentValue = m_nextValue_;
int currentBlock = m_nextBlock_;
m_nextCodepoint_ ++;
m_nextBlockIndex_ ++;
if (UTF16.getTrailSurrogate(m_nextCodepoint_)
!= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
// this piece is only called when we are in the middle of a lead
// surrogate block
if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
}
// we have cleared one block
m_nextIndex_ ++;
m_nextTrailIndexOffset_ ++;
if (!checkTrailBlock(currentBlock, currentValue)) {
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
}
}
int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
// enumerate supplementary code points
while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
// lead surrogate access
int leadBlock =
m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
Trie.INDEX_STAGE_2_SHIFT_;
if (leadBlock == m_trie_.m_dataOffset_) {
// no entries for a whole block of lead surrogates
if (currentValue != m_initialValue_) {
m_nextValue_ = m_initialValue_;
m_nextBlock_ = 0;
m_nextBlockIndex_ = 0;
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
}
nextLead += DATA_BLOCK_LENGTH_;
// number of total affected supplementary codepoints in one
// block
// this is not a simple addition of
// DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
// that we might have moved some of the codepoints
m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
(char)nextLead,
(char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
continue;
}
if (m_trie_.m_dataManipulate_ == null) {
throw new NullPointerException(
"The field DataManipulate in this Trie is null");
}
// enumerate trail surrogates for this lead surrogate
m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
m_trie_.getValue(leadBlock +
(nextLead & Trie.INDEX_STAGE_3_MASK_)));
if (m_nextIndex_ <= 0) {
// no data for this lead surrogate
if (currentValue != m_initialValue_) {
m_nextValue_ = m_initialValue_;
m_nextBlock_ = 0;
m_nextBlockIndex_ = 0;
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
}
m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
} else {
m_nextTrailIndexOffset_ = 0;
if (!checkTrailBlock(currentBlock, currentValue)) {
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
}
}
nextLead ++;
}
// deliver last range
setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
currentValue);
}
/**
* Internal block value calculations
* Performs calculations on a data block to find codepoints in m_nextBlock_
* after the index m_nextBlockIndex_ that has the same value.
* Note m_*_ variables at this point is the next codepoint whose value
* has not been calculated.
* But when returned with false, it will be the last codepoint whose
* value has been calculated.
* @param currentValue the value which other codepoints are tested against
* @return true if the whole block has the same value as currentValue or if
* the whole block has been calculated, false otherwise.
*/
private final boolean checkBlockDetail(int currentValue)
{
while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
m_nextBlockIndex_));
if (m_nextValue_ != currentValue) {
return false;
}
++ m_nextBlockIndex_;
++ m_nextCodepoint_;
}
return true;
}
/**
* Internal block value calculations
* Performs calculations on a data block to find codepoints in m_nextBlock_
* that has the same value.
* Will call checkBlockDetail() if highlevel check fails.
* Note m_*_ variables at this point is the next codepoint whose value
* has not been calculated.
* @param currentBlock the initial block containing all currentValue
* @param currentValue the value which other codepoints are tested against
* @return true if the whole block has the same value as currentValue or if
* the whole block has been calculated, false otherwise.
*/
private final boolean checkBlock(int currentBlock, int currentValue)
{
m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
Trie.INDEX_STAGE_2_SHIFT_;
if (m_nextBlock_ == currentBlock &&
(m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
// the block is the same as the previous one, filled with
// currentValue
m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
}
else if (m_nextBlock_ == 0) {
// this is the all-initial-value block
if (currentValue != m_initialValue_) {
m_nextValue_ = m_initialValue_;
m_nextBlockIndex_ = 0;
return false;
}
m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
}
else {
if (!checkBlockDetail(currentValue)) {
return false;
}
}
return true;
}
/**
* Internal block value calculations
* Performs calculations on multiple data blocks for a set of trail
* surrogates to find codepoints in m_nextBlock_ that has the same value.
* Will call checkBlock() for internal block checks.
* Note m_*_ variables at this point is the next codepoint whose value
* has not been calculated.
* @param currentBlock the initial block containing all currentValue
* @param currentValue the value which other codepoints are tested against
* @return true if the whole block has the same value as currentValue or if
* the whole block has been calculated, false otherwise.
*/
private final boolean checkTrailBlock(int currentBlock,
int currentValue)
{
// enumerate code points for this lead surrogate
while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
{
// if we ever reach here, we are at the start of a new block
m_nextBlockIndex_ = 0;
// copy of most of the body of the BMP loop
if (!checkBlock(currentBlock, currentValue)) {
return false;
}
m_nextTrailIndexOffset_ ++;
m_nextIndex_ ++;
}
return true;
}
/**
* Checks if we are beginning at the start of a initial block.
* If we are then the rest of the codepoints in this initial block
* has the same values.
* We increment m_nextCodepoint_ and relevant data members if so.
* This is used only in for the supplementary codepoints because
* the offset to the trail indexes could be 0.
* @return true if we are at the start of a initial block.
*/
private final boolean checkNullNextTrailIndex()
{
if (m_nextIndex_ <= 0) {
m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
int leadBlock =
m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
Trie.INDEX_STAGE_2_SHIFT_;
if (m_trie_.m_dataManipulate_ == null) {
throw new NullPointerException(
"The field DataManipulate in this Trie is null");
}
m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
m_trie_.getValue(leadBlock +
(nextLead & Trie.INDEX_STAGE_3_MASK_)));
m_nextIndex_ --;
m_nextBlockIndex_ = DATA_BLOCK_LENGTH_;
return true;
}
return false;
}
// private data members --------------------------------------------
/**
* Size of the stage 1 BMP indexes
*/
private static final int BMP_INDEX_LENGTH_ =
0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
/**
* Lead surrogate minimum value
*/
private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
/**
* Trail surrogate minimum value
*/
private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
/**
* Number of trail surrogate
*/
private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
/**
* Number of stage 1 indexes for supplementary calculations that maps to
* each lead surrogate character.
* See second pass into getRawOffset for the trail surrogate character.
* 10 for significant number of bits for trail surrogates, 5 for what we
* discard during shifting.
*/
private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
/**
* Number of data values in a stage 2 (data array) block.
*/
private static final int DATA_BLOCK_LENGTH_ =
1 << Trie.INDEX_STAGE_1_SHIFT_;
/**
* Trie instance
*/
private Trie m_trie_;
/**
* Initial value for trie values
*/
private int m_initialValue_;
/**
* Next element results and data.
*/
private int m_currentCodepoint_;
private int m_nextCodepoint_;
private int m_nextValue_;
private int m_nextIndex_;
private int m_nextBlock_;
private int m_nextBlockIndex_;
private int m_nextTrailIndexOffset_;
}

View File

@@ -0,0 +1,179 @@
/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
* file name: UBiDiProps.java
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2005jan16
* created by: Markus W. Scherer
*
* Low-level Unicode bidi/shaping properties access.
* Java port of ubidi_props.h/.c.
*/
package sun.text.normalizer;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
public final class UBiDiProps {
// constructors etc. --------------------------------------------------- ***
// port of ubidi_openProps()
public UBiDiProps() throws IOException{
InputStream is=ICUData.getStream(DATA_FILE_NAME);
BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */);
readData(b);
b.close();
is.close();
}
private void readData(InputStream is) throws IOException {
DataInputStream inputStream=new DataInputStream(is);
// read the header
ICUBinary.readHeader(inputStream, FMT, new IsAcceptable());
// read indexes[]
int i, count;
count=inputStream.readInt();
if(count<IX_INDEX_TOP) {
throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
}
indexes=new int[count];
indexes[0]=count;
for(i=1; i<count; ++i) {
indexes[i]=inputStream.readInt();
}
// read the trie
trie=new CharTrie(inputStream, null);
// read mirrors[]
count=indexes[IX_MIRROR_LENGTH];
if(count>0) {
mirrors=new int[count];
for(i=0; i<count; ++i) {
mirrors[i]=inputStream.readInt();
}
}
// read jgArray[]
count=indexes[IX_JG_LIMIT]-indexes[IX_JG_START];
jgArray=new byte[count];
for(i=0; i<count; ++i) {
jgArray[i]=inputStream.readByte();
}
}
// implement ICUBinary.Authenticate
private final class IsAcceptable implements ICUBinary.Authenticate {
public boolean isDataVersionAcceptable(byte version[]) {
return version[0]==1 &&
version[2]==Trie.INDEX_STAGE_1_SHIFT_ && version[3]==Trie.INDEX_STAGE_2_SHIFT_;
}
}
// UBiDiProps singleton
private static UBiDiProps gBdp=null;
// port of ubidi_getSingleton()
public static final synchronized UBiDiProps getSingleton() throws IOException {
if(gBdp==null) {
gBdp=new UBiDiProps();
}
return gBdp;
}
// UBiDiProps dummy singleton
private static UBiDiProps gBdpDummy=null;
private UBiDiProps(boolean makeDummy) { // ignore makeDummy, only creates a unique signature
indexes=new int[IX_TOP];
indexes[0]=IX_TOP;
trie=new CharTrie(0, 0, null); // dummy trie, always returns 0
}
/**
* Get a singleton dummy object, one that works with no real data.
* This can be used when the real data is not available.
* Using the dummy can reduce checks for available data after an initial failure.
* Port of ucase_getDummy().
*/
public static final synchronized UBiDiProps getDummy() {
if(gBdpDummy==null) {
gBdpDummy=new UBiDiProps(true);
}
return gBdpDummy;
}
public final int getClass(int c) {
return getClassFromProps(trie.getCodePointValue(c));
}
// data members -------------------------------------------------------- ***
private int indexes[];
private int mirrors[];
private byte jgArray[];
private CharTrie trie;
// data format constants ----------------------------------------------- ***
private static final String DATA_FILE_NAME = "/sun/text/resources/ubidi.icu";
/* format "BiDi" */
private static final byte FMT[]={ 0x42, 0x69, 0x44, 0x69 };
/* indexes into indexes[] */
private static final int IX_INDEX_TOP=0;
private static final int IX_MIRROR_LENGTH=3;
private static final int IX_JG_START=4;
private static final int IX_JG_LIMIT=5;
private static final int IX_TOP=16;
private static final int CLASS_MASK= 0x0000001f;
private static final int getClassFromProps(int props) {
return props&CLASS_MASK;
}
}

View File

@@ -0,0 +1,431 @@
/*
* Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.util.MissingResourceException;
/**
* <p>
* The UCharacter class provides extensions to the
* <a href="https://docs.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html">
* java.lang.Character</a> class. These extensions provide support for
* more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
* class, provide support for supplementary characters (those with code
* points above U+FFFF).
* Each ICU release supports the latest version of Unicode available at that time.
* </p>
* <p>
* Code points are represented in these API using ints. While it would be
* more convenient in Java to have a separate primitive datatype for them,
* ints suffice in the meantime.
* </p>
* <p>
* To use this class please add the jar file name icu4j.jar to the
* class path, since it contains data files which supply the information used
* by this file.<br>
* E.g. In Windows <br>
* <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
* Otherwise, another method would be to copy the files uprops.dat and
* unames.icu from the icu4j source subdirectory
* <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
* <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
* </p>
* <p>
* Aside from the additions for UTF-16 support, and the updated Unicode
* properties, the main differences between UCharacter and Character are:
* <ul>
* <li> UCharacter is not designed to be a char wrapper and does not have
* APIs to which involves management of that single char.<br>
* These include:
* <ul>
* <li> char charValue(),
* <li> int compareTo(java.lang.Character, java.lang.Character), etc.
* </ul>
* <li> UCharacter does not include Character APIs that are deprecated, nor
* does it include the Java-specific character information, such as
* boolean isJavaIdentifierPart(char ch).
* <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
* values '10' - '35'. UCharacter also does this in digit and
* getNumericValue, to adhere to the java semantics of these
* methods. New methods unicodeDigit, and
* getUnicodeNumericValue do not treat the above code points
* as having numeric values. This is a semantic change from ICU4J 1.3.1.
* </ul>
* <p>
* Further detail differences can be determined from the program
* <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
* </p>
* <p>
* In addition to Java compatibility functions, which calculate derived properties,
* this API provides low-level access to the Unicode Character Database.
* </p>
* <p>
* Unicode assigns each code point (not just assigned character) values for
* many properties.
* Most of them are simple boolean flags, or constants from a small enumerated list.
* For some properties, values are strings or other relatively more complex types.
* </p>
* <p>
* For more information see
* "About the Unicode Character Database" (http://www.unicode.org/ucd/)
* and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).
* </p>
* <p>
* There are also functions that provide easy migration from C/POSIX functions
* like isblank(). Their use is generally discouraged because the C/POSIX
* standards do not define their semantics beyond the ASCII range, which means
* that different implementations exhibit very different behavior.
* Instead, Unicode properties should be used directly.
* </p>
* <p>
* There are also only a few, broad C/POSIX character classes, and they tend
* to be used for conflicting purposes. For example, the "isalpha()" class
* is sometimes used to determine word boundaries, while a more sophisticated
* approach would at least distinguish initial letters from continuation
* characters (the latter including combining marks).
* (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
* Another example: There is no "istitle()" class for titlecase characters.
* </p>
* <p>
* ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
* ICU implements them according to the Standard Recommendations in
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
* </p>
* <p>
* API access for C/POSIX character classes is as follows:
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
* - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
* - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
* - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
* - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
* - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
* - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
* - cntrl: getType(c)==CONTROL
* - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
* - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
* </p>
* <p>
* The C/POSIX character classes are also available in UnicodeSet patterns,
* using patterns like [:graph:] or \p{graph}.
* </p>
* <p>
* Note: There are several ICU (and Java) whitespace functions.
* Comparison:
* - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* - isSpaceChar: just Z (including no-break spaces)
* </p>
* <p>
* This class is not subclassable
* </p>
* @author Syn Wee Quek
* @stable ICU 2.1
* @see com.ibm.icu.lang.UCharacterEnums
*/
public final class UCharacter
{
/**
* Numeric Type constants.
* @see UProperty#NUMERIC_TYPE
* @stable ICU 2.4
*/
public static interface NumericType
{
/**
* @stable ICU 2.4
*/
public static final int DECIMAL = 1;
}
// public data members -----------------------------------------------
/**
* The lowest Unicode code point value.
* @stable ICU 2.1
*/
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
/**
* The highest Unicode code point value (scalar value) according to the
* Unicode Standard.
* This is a 21-bit value (21 bits, rounded up).<br>
* Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
* @stable ICU 2.1
*/
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
/**
* The minimum value for Supplementary code points
* @stable ICU 2.1
*/
public static final int SUPPLEMENTARY_MIN_VALUE =
UTF16.SUPPLEMENTARY_MIN_VALUE;
// public methods ----------------------------------------------------
/**
* Retrieves the numeric value of a decimal digit code point.
* <br>This method observes the semantics of
* <code>java.lang.Character.digit()</code>. Note that this
* will return positive values for code points for which isDigit
* returns false, just like java.lang.Character.
* <br><em>Semantic Change:</em> In release 1.3.1 and
* prior, this did not treat the European letters as having a
* digit value, and also treated numeric letters and other numbers as
* digits.
* This has been changed to conform to the java semantics.
* <br>A code point is a valid digit if and only if:
* <ul>
* <li>ch is a decimal digit or one of the european letters, and
* <li>the value of ch is less than the specified radix.
* </ul>
* @param ch the code point to query
* @param radix the radix
* @return the numeric value represented by the code point in the
* specified radix, or -1 if the code point is not a decimal digit
* or if its value is too large for the radix
* @stable ICU 2.1
*/
public static int digit(int ch, int radix)
{
// when ch is out of bounds getProperty == 0
int props = getProperty(ch);
int value;
if (getNumericType(props) == NumericType.DECIMAL) {
value = UCharacterProperty.getUnsignedValue(props);
} else {
value = getEuropeanDigit(ch);
}
return (0 <= value && value < radix) ? value : -1;
}
/**
* Returns the Bidirection property of a code point.
* For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
* property.<br>
* Result returned belongs to the interface
* <a href=UCharacterDirection.html>UCharacterDirection</a>
* @param ch the code point to be determined its direction
* @return direction constant from UCharacterDirection.
* @stable ICU 2.1
*/
public static int getDirection(int ch)
{
return gBdp.getClass(ch);
}
/**
* Returns a code point corresponding to the two UTF16 characters.
* @param lead the lead char
* @param trail the trail char
* @return code point if surrogate characters are valid.
* @exception IllegalArgumentException thrown when argument characters do
* not form a valid codepoint
* @stable ICU 2.1
*/
public static int getCodePoint(char lead, char trail)
{
if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) {
return UCharacterProperty.getRawSupplementary(lead, trail);
}
throw new IllegalArgumentException("Illegal surrogate characters");
}
/**
* <p>Get the "age" of the code point.</p>
* <p>The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
* character.
* <p>This can be useful to avoid emitting code points to receiving
* processes that do not accept newer characters.</p>
* <p>The data is from the UCD file DerivedAge.txt.</p>
* @param ch The code point.
* @return the Unicode version number
* @stable ICU 2.6
*/
public static VersionInfo getAge(int ch)
{
if (ch < MIN_VALUE || ch > MAX_VALUE) {
throw new IllegalArgumentException("Codepoint out of bounds");
}
return PROPERTY_.getAge(ch);
}
// private variables -------------------------------------------------
/**
* Database storing the sets of character property
*/
private static final UCharacterProperty PROPERTY_;
/**
* For optimization
*/
private static final char[] PROPERTY_TRIE_INDEX_;
private static final char[] PROPERTY_TRIE_DATA_;
private static final int PROPERTY_INITIAL_VALUE_;
private static final UBiDiProps gBdp;
// block to initialise character property database
static
{
try
{
PROPERTY_ = UCharacterProperty.getInstance();
PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_;
}
catch (Exception e)
{
throw new MissingResourceException(e.getMessage(),"","");
}
UBiDiProps bdp;
try {
bdp=UBiDiProps.getSingleton();
} catch(IOException e) {
bdp=UBiDiProps.getDummy();
}
gBdp=bdp;
}
/**
* Shift to get numeric type
*/
private static final int NUMERIC_TYPE_SHIFT_ = 5;
/**
* Mask to get numeric type
*/
private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
// private methods ---------------------------------------------------
/**
* Getting the digit values of characters like 'A' - 'Z', normal,
* half-width and full-width. This method assumes that the other digit
* characters are checked by the calling method.
* @param ch character to test
* @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
* its corresponding digit will be returned.
*/
private static int getEuropeanDigit(int ch) {
if ((ch > 0x7a && ch < 0xff21)
|| ch < 0x41 || (ch > 0x5a && ch < 0x61)
|| ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
return -1;
}
if (ch <= 0x7a) {
// ch >= 0x41 or ch < 0x61
return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
}
// ch >= 0xff21
if (ch <= 0xff3a) {
return ch + 10 - 0xff21;
}
// ch >= 0xff41 && ch <= 0xff5a
return ch + 10 - 0xff41;
}
/**
* Gets the numeric type of the property argument
* @param props 32 bit property
* @return the numeric type
*/
private static int getNumericType(int props)
{
return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
}
/**
* Gets the property value at the index.
* This is optimized.
* Note this is alittle different from CharTrie the index m_trieData_
* is never negative.
* This is a duplicate of UCharacterProperty.getProperty. For optimization
* purposes, this method calls the trie data directly instead of through
* UCharacterProperty.getProperty.
* @param ch code point whose property value is to be retrieved
* @return property value of code point
* @stable ICU 2.6
*/
private static final int getProperty(int ch)
{
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
// BMP codepoint 0000..D7FF or DC00..FFFF
try { // using try for ch < 0 is faster than using an if statement
return PROPERTY_TRIE_DATA_[
(PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
+ (ch & 0x1f)];
} catch (ArrayIndexOutOfBoundsException e) {
return PROPERTY_INITIAL_VALUE_;
}
}
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
// lead surrogate D800..DBFF
return PROPERTY_TRIE_DATA_[
(PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
+ (ch & 0x1f)];
}
// for optimization
if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
// supplementary code point 10000..10FFFF
// look at the construction of supplementary characters
// trail forms the ends of it.
return PROPERTY_.m_trie_.getSurrogateValue(
UTF16.getLeadSurrogate(ch),
(char)(ch & 0x3ff));
}
// return m_dataOffset_ if there is an error, in this case we return
// the default value: m_initialValue_
// we cannot assume that m_initialValue_ is at offset 0
// this is for optimization.
return PROPERTY_INITIAL_VALUE_;
}
}

View File

@@ -0,0 +1,292 @@
/*
* Copyright (c) 2003, 2006, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.text.CharacterIterator;
/**
* Abstract class that defines an API for iteration on text objects.This is an
* interface for forward and backward iteration and random access into a text
* object. Forward iteration is done with post-increment and backward iteration
* is done with pre-decrement semantics, while the
* <code>java.text.CharacterIterator</code> interface methods provided forward
* iteration with "pre-increment" and backward iteration with pre-decrement
* semantics. This API is more efficient for forward iteration over code points.
* The other major difference is that this API can do both code unit and code point
* iteration, <code>java.text.CharacterIterator</code> can only iterate over
* code units and is limited to BMP (0 - 0xFFFF)
* @author Ram
* @stable ICU 2.4
*/
public abstract class UCharacterIterator
implements Cloneable {
/**
* Protected default constructor for the subclasses
* @stable ICU 2.4
*/
protected UCharacterIterator(){
}
/**
* Indicator that we have reached the ends of the UTF16 text.
* Moved from UForwardCharacterIterator.java
* @stable ICU 2.4
*/
public static final int DONE = -1;
// static final methods ----------------------------------------------------
/**
* Returns a <code>UCharacterIterator</code> object given a
* source string.
* @param source a string
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
* @stable ICU 2.4
*/
public static final UCharacterIterator getInstance(String source){
return new ReplaceableUCharacterIterator(source);
}
//// for StringPrep
/**
* Returns a <code>UCharacterIterator</code> object given a
* source StringBuffer.
* @param source an string buffer of UTF-16 code units
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
* @stable ICU 2.4
*/
public static final UCharacterIterator getInstance(StringBuffer source){
return new ReplaceableUCharacterIterator(source);
}
/**
* Returns a <code>UCharacterIterator</code> object given a
* CharacterIterator.
* @param source a valid CharacterIterator object.
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
* @stable ICU 2.4
*/
public static final UCharacterIterator getInstance(CharacterIterator source){
return new CharacterIteratorWrapper(source);
}
// public methods ----------------------------------------------------------
/**
* Returns the code unit at the current index. If index is out
* of range, returns DONE. Index is not changed.
* @return current code unit
* @stable ICU 2.4
*/
public abstract int current();
/**
* Returns the length of the text
* @return length of the text
* @stable ICU 2.4
*/
public abstract int getLength();
/**
* Gets the current index in text.
* @return current index in text.
* @stable ICU 2.4
*/
public abstract int getIndex();
/**
* Returns the UTF16 code unit at index, and increments to the next
* code unit (post-increment semantics). If index is out of
* range, DONE is returned, and the iterator is reset to the limit
* of the text.
* @return the next UTF16 code unit, or DONE if the index is at the limit
* of the text.
* @stable ICU 2.4
*/
public abstract int next();
/**
* Returns the code point at index, and increments to the next code
* point (post-increment semantics). If index does not point to a
* valid surrogate pair, the behavior is the same as
* <code>next()</code>. Otherwise the iterator is incremented past
* the surrogate pair, and the code point represented by the pair
* is returned.
* @return the next codepoint in text, or DONE if the index is at
* the limit of the text.
* @stable ICU 2.4
*/
public int nextCodePoint(){
int ch1 = next();
if(UTF16.isLeadSurrogate((char)ch1)){
int ch2 = next();
if(UTF16.isTrailSurrogate((char)ch2)){
return UCharacterProperty.getRawSupplementary((char)ch1,
(char)ch2);
}else if (ch2 != DONE) {
// unmatched surrogate so back out
previous();
}
}
return ch1;
}
/**
* Decrement to the position of the previous code unit in the
* text, and return it (pre-decrement semantics). If the
* resulting index is less than 0, the index is reset to 0 and
* DONE is returned.
* @return the previous code unit in the text, or DONE if the new
* index is before the start of the text.
* @stable ICU 2.4
*/
public abstract int previous();
/**
* Sets the index to the specified index in the text.
* @param index the index within the text.
* @exception IndexOutOfBoundsException is thrown if an invalid index is
* supplied
* @stable ICU 2.4
*/
public abstract void setIndex(int index);
//// for StringPrep
/**
* Fills the buffer with the underlying text storage of the iterator
* If the buffer capacity is not enough a exception is thrown. The capacity
* of the fill in buffer should at least be equal to length of text in the
* iterator obtained by calling <code>getLength()</code>.
* <b>Usage:</b>
*
* <code>
* <pre>
* UChacterIterator iter = new UCharacterIterator.getInstance(text);
* char[] buf = new char[iter.getLength()];
* iter.getText(buf);
*
* OR
* char[] buf= new char[1];
* int len = 0;
* for(;;){
* try{
* len = iter.getText(buf);
* break;
* }catch(IndexOutOfBoundsException e){
* buf = new char[iter.getLength()];
* }
* }
* </pre>
* </code>
*
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @param offset the position within the array to start putting the data.
* @return the number of code units added to fillIn, as a convenience
* @exception IndexOutOfBounds exception if there is not enough
* room after offset in the array, or if offset < 0.
* @stable ICU 2.4
*/
public abstract int getText(char[] fillIn, int offset);
//// for StringPrep
/**
* Convenience override for <code>getText(char[], int)</code> that provides
* an offset of 0.
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @return the number of code units added to fillIn, as a convenience
* @exception IndexOutOfBounds exception if there is not enough
* room in the array.
* @stable ICU 2.4
*/
public final int getText(char[] fillIn) {
return getText(fillIn, 0);
}
//// for StringPrep
/**
* Convenience method for returning the underlying text storage as as string
* @return the underlying text storage in the iterator as a string
* @stable ICU 2.4
*/
public String getText() {
char[] text = new char[getLength()];
getText(text);
return new String(text);
}
/**
* Moves the current position by the number of code units
* specified, either forward or backward depending on the sign
* of delta (positive or negative respectively). If the resulting
* index would be less than zero, the index is set to zero, and if
* the resulting index would be greater than limit, the index is
* set to limit.
*
* @param delta the number of code units to move the current
* index.
* @return the new index.
* @exception IndexOutOfBoundsException is thrown if an invalid index is
* supplied
* @stable ICU 2.4
*
*/
public int moveIndex(int delta) {
int x = Math.max(0, Math.min(getIndex() + delta, getLength()));
setIndex(x);
return x;
}
/**
* Creates a copy of this iterator, independent from other iterators.
* If it is not possible to clone the iterator, returns null.
* @return copy of this iterator
* @stable ICU 2.4
*/
public Object clone() throws CloneNotSupportedException{
return super.clone();
}
}

View File

@@ -0,0 +1,369 @@
/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.MissingResourceException;
/**
* <p>Internal class used for Unicode character property database.</p>
* <p>This classes store binary data read from uprops.icu.
* It does not have the capability to parse the data into more high-level
* information. It only returns bytes of information when required.</p>
* <p>Due to the form most commonly used for retrieval, array of char is used
* to store the binary data.</p>
* <p>UCharacterPropertyDB also contains information on accessing indexes to
* significant points in the binary data.</p>
* <p>Responsibility for molding the binary data into more meaning form lies on
* <a href=UCharacter.html>UCharacter</a>.</p>
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
*/
public final class UCharacterProperty
{
// public data members -----------------------------------------------
/**
* Trie data
*/
public CharTrie m_trie_;
/**
* Optimization
* CharTrie index array
*/
public char[] m_trieIndex_;
/**
* Optimization
* CharTrie data array
*/
public char[] m_trieData_;
/**
* Optimization
* CharTrie data offset
*/
public int m_trieInitialValue_;
/**
* Unicode version
*/
public VersionInfo m_unicodeVersion_;
// uprops.h enum UPropertySource --------------------------------------- ***
/** From uchar.c/uprops.icu properties vectors trie */
public static final int SRC_PROPSVEC=2;
/** One more than the highest UPropertySource (SRC_) constant. */
public static final int SRC_COUNT=9;
// public methods ----------------------------------------------------
/**
* Java friends implementation
*/
public void setIndexData(CharTrie.FriendAgent friendagent)
{
m_trieIndex_ = friendagent.getPrivateIndex();
m_trieData_ = friendagent.getPrivateData();
m_trieInitialValue_ = friendagent.getPrivateInitialValue();
}
/**
* Gets the property value at the index.
* This is optimized.
* Note this is alittle different from CharTrie the index m_trieData_
* is never negative.
* @param ch code point whose property value is to be retrieved
* @return property value of code point
*/
public final int getProperty(int ch)
{
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
// BMP codepoint 0000..D7FF or DC00..FFFF
// optimized
try { // using try for ch < 0 is faster than using an if statement
return m_trieData_[
(m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
<< Trie.INDEX_STAGE_2_SHIFT_)
+ (ch & Trie.INDEX_STAGE_3_MASK_)];
} catch (ArrayIndexOutOfBoundsException e) {
return m_trieInitialValue_;
}
}
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
// lead surrogate D800..DBFF
return m_trieData_[
(m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
+ (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
<< Trie.INDEX_STAGE_2_SHIFT_)
+ (ch & Trie.INDEX_STAGE_3_MASK_)];
}
if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
// supplementary code point 10000..10FFFF
// look at the construction of supplementary characters
// trail forms the ends of it.
return m_trie_.getSurrogateValue(
UTF16.getLeadSurrogate(ch),
(char)(ch & Trie.SURROGATE_MASK_));
}
// ch is out of bounds
// return m_dataOffset_ if there is an error, in this case we return
// the default value: m_initialValue_
// we cannot assume that m_initialValue_ is at offset 0
// this is for optimization.
return m_trieInitialValue_;
// this all is an inlined form of return m_trie_.getCodePointValue(ch);
}
/**
* Getting the unsigned numeric value of a character embedded in the property
* argument
* @param prop the character
* @return unsigned numberic value
*/
public static int getUnsignedValue(int prop)
{
return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
}
/**
* Gets the unicode additional properties.
* C version getUnicodeProperties.
* @param codepoint codepoint whose additional properties is to be
* retrieved
* @param column
* @return unicode properties
*/
public int getAdditional(int codepoint, int column) {
if (column == -1) {
return getProperty(codepoint);
}
if (column < 0 || column >= m_additionalColumnsCount_) {
return 0;
}
return m_additionalVectors_[
m_additionalTrie_.getCodePointValue(codepoint) + column];
}
/**
* <p>Get the "age" of the code point.</p>
* <p>The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
* character.</p>
* <p>This can be useful to avoid emitting code points to receiving
* processes that do not accept newer characters.</p>
* <p>The data is from the UCD file DerivedAge.txt.</p>
* <p>This API does not check the validity of the codepoint.</p>
* @param codepoint The code point.
* @return the Unicode version number
*/
public VersionInfo getAge(int codepoint)
{
int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
return VersionInfo.getInstance(
(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
version & LAST_NIBBLE_MASK_, 0, 0);
}
/**
* Forms a supplementary code point from the argument character<br>
* Note this is for internal use hence no checks for the validity of the
* surrogate characters are done
* @param lead lead surrogate character
* @param trail trailing surrogate character
* @return code point of the supplementary character
*/
public static int getRawSupplementary(char lead, char trail)
{
return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
}
/**
* Loads the property data and initialize the UCharacterProperty instance.
* @throws MissingResourceException when data is missing or data has been corrupted
*/
public static UCharacterProperty getInstance()
{
if(INSTANCE_ == null) {
try {
INSTANCE_ = new UCharacterProperty();
}
catch (Exception e) {
throw new MissingResourceException(e.getMessage(),"","");
}
}
return INSTANCE_;
}
/**
* Checks if the argument c is to be treated as a white space in ICU
* rules. Usually ICU rule white spaces are ignored unless quoted.
* Equivalent to test for Pattern_White_Space Unicode property.
* Stable set of characters, won't change.
* See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
* @param c codepoint to check
* @return true if c is a ICU white space
*/
public static boolean isRuleWhiteSpace(int c)
{
/* "white space" in the sense of ICU rule parsers
This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
Equivalent to test for Pattern_White_Space Unicode property.
*/
return (c >= 0x0009 && c <= 0x2029 &&
(c <= 0x000D || c == 0x0020 || c == 0x0085 ||
c == 0x200E || c == 0x200F || c >= 0x2028));
}
// protected variables -----------------------------------------------
/**
* Extra property trie
*/
CharTrie m_additionalTrie_;
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
*/
int m_additionalVectors_[];
/**
* Number of additional columns
*/
int m_additionalColumnsCount_;
/**
* Maximum values for block, bits used as in vector word
* 0
*/
int m_maxBlockScriptValue_;
/**
* Maximum values for script, bits used as in vector word
* 0
*/
int m_maxJTGValue_;
// private variables -------------------------------------------------
/**
* UnicodeData.txt property object
*/
private static UCharacterProperty INSTANCE_ = null;
/**
* Default name of the datafile
*/
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
/**
* Default buffer size of datafile
*/
private static final int DATA_BUFFER_SIZE_ = 25000;
/**
* Numeric value shift
*/
private static final int VALUE_SHIFT_ = 8;
/**
* Mask to be applied after shifting to obtain an unsigned numeric value
*/
private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
/**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
* Offset to add to combined surrogate pair to avoid msking.
*/
private static final int SURROGATE_OFFSET_ =
UTF16.SUPPLEMENTARY_MIN_VALUE -
(UTF16.SURROGATE_MIN_VALUE <<
LEAD_SURROGATE_SHIFT_) -
UTF16.TRAIL_SURROGATE_MIN_VALUE;
// additional properties ----------------------------------------------
/**
* First nibble shift
*/
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
/**
* Second nibble mask
*/
private static final int LAST_NIBBLE_MASK_ = 0xF;
/**
* Age value shift
*/
private static final int AGE_SHIFT_ = 24;
// private constructors --------------------------------------------------
/**
* Constructor
* @exception IOException thrown when data reading fails or data corrupted
*/
private UCharacterProperty() throws IOException
{
// jar access
InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
reader.read(this);
b.close();
m_trie_.putIndexData(this);
}
public void upropsvec_addPropertyStarts(UnicodeSet set) {
/* add the start code point of each same-value range of the properties vectors trie */
if(m_additionalColumnsCount_>0) {
/* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */
TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_);
RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element();
while(propsVectorsIter.next(propsVectorsResult)){
set.add(propsVectorsResult.start);
}
}
}
}

View File

@@ -0,0 +1,190 @@
/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
/**
* <p>Internal reader class for ICU data file uprops.icu containing
* Unicode codepoint data.</p>
* <p>This class simply reads uprops.icu, authenticates that it is a valid
* ICU data file and split its contents up into blocks of data for use in
* <a href=UCharacterProperty.html>com.ibm.icu.impl.UCharacterProperty</a>.
* </p>
* <p>uprops.icu which is in big-endian format is jared together with this
* package.</p>
*
* Unicode character properties file format see
* (ICU4C)/source/tools/genprops/store.c
*
* @author Syn Wee Quek
* @since release 2.1, February 1st 2002
*/
final class UCharacterPropertyReader implements ICUBinary.Authenticate
{
// public methods ----------------------------------------------------
public boolean isDataVersionAcceptable(byte version[])
{
return version[0] == DATA_FORMAT_VERSION_[0]
&& version[2] == DATA_FORMAT_VERSION_[2]
&& version[3] == DATA_FORMAT_VERSION_[3];
}
// protected constructor ---------------------------------------------
/**
* <p>Protected constructor.</p>
* @param inputStream ICU uprop.dat file input stream
* @exception IOException throw if data file fails authentication
*/
protected UCharacterPropertyReader(InputStream inputStream)
throws IOException
{
m_unicodeVersion_ = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_,
this);
m_dataInputStream_ = new DataInputStream(inputStream);
}
// protected methods -------------------------------------------------
/**
* <p>Reads uprops.icu, parse it into blocks of data to be stored in
* UCharacterProperty.</P
* @param ucharppty UCharacterProperty instance
* @exception IOException thrown when data reading fails
*/
protected void read(UCharacterProperty ucharppty) throws IOException
{
// read the indexes
int count = INDEX_SIZE_;
m_propertyOffset_ = m_dataInputStream_.readInt();
count --;
m_exceptionOffset_ = m_dataInputStream_.readInt();
count --;
m_caseOffset_ = m_dataInputStream_.readInt();
count --;
m_additionalOffset_ = m_dataInputStream_.readInt();
count --;
m_additionalVectorsOffset_ = m_dataInputStream_.readInt();
count --;
m_additionalColumnsCount_ = m_dataInputStream_.readInt();
count --;
m_reservedOffset_ = m_dataInputStream_.readInt();
count --;
m_dataInputStream_.skipBytes(3 << 2);
count -= 3;
ucharppty.m_maxBlockScriptValue_ = m_dataInputStream_.readInt();
count --; // 10
ucharppty.m_maxJTGValue_ = m_dataInputStream_.readInt();
count --; // 11
m_dataInputStream_.skipBytes(count << 2);
// read the trie index block
// m_props_index_ in terms of ints
ucharppty.m_trie_ = new CharTrie(m_dataInputStream_, null);
// skip the 32 bit properties block
int size = m_exceptionOffset_ - m_propertyOffset_;
m_dataInputStream_.skipBytes(size * 4);
// reads the 32 bit exceptions block
size = m_caseOffset_ - m_exceptionOffset_;
m_dataInputStream_.skipBytes(size * 4);
// reads the 32 bit case block
size = (m_additionalOffset_ - m_caseOffset_) << 1;
m_dataInputStream_.skipBytes(size * 2);
if(m_additionalColumnsCount_ > 0) {
// reads the additional property block
ucharppty.m_additionalTrie_ = new CharTrie(m_dataInputStream_, null);
// additional properties
size = m_reservedOffset_ - m_additionalVectorsOffset_;
ucharppty.m_additionalVectors_ = new int[size];
for (int i = 0; i < size; i ++) {
ucharppty.m_additionalVectors_[i] = m_dataInputStream_.readInt();
}
}
m_dataInputStream_.close();
ucharppty.m_additionalColumnsCount_ = m_additionalColumnsCount_;
ucharppty.m_unicodeVersion_ = VersionInfo.getInstance(
(int)m_unicodeVersion_[0], (int)m_unicodeVersion_[1],
(int)m_unicodeVersion_[2], (int)m_unicodeVersion_[3]);
}
// private variables -------------------------------------------------
/**
* Index size
*/
private static final int INDEX_SIZE_ = 16;
/**
* ICU data file input stream
*/
private DataInputStream m_dataInputStream_;
/**
* Offset information in the indexes.
*/
private int m_propertyOffset_;
private int m_exceptionOffset_;
private int m_caseOffset_;
private int m_additionalOffset_;
private int m_additionalVectorsOffset_;
private int m_additionalColumnsCount_;
private int m_reservedOffset_;
private byte m_unicodeVersion_[];
/**
* Data format "UPro".
*/
private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x50,
(byte)0x72, (byte)0x6F};
/**
* Format version; this code works with all versions with the same major
* version number and the same Trie bit distribution.
*/
private static final byte DATA_FORMAT_VERSION_[] = {(byte)0x5, (byte)0,
(byte)Trie.INDEX_STAGE_1_SHIFT_,
(byte)Trie.INDEX_STAGE_2_SHIFT_};
}

View File

@@ -0,0 +1,538 @@
/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <p>Standalone utility class providing UTF16 character conversions and
* indexing conversions.</p>
* <p>Code that uses strings alone rarely need modification.
* By design, UTF-16 does not allow overlap, so searching for strings is a safe
* operation. Similarly, concatenation is always safe. Substringing is safe if
* the start and end are both on UTF-32 boundaries. In normal code, the values
* for start and end are on those boundaries, since they arose from operations
* like searching. If not, the nearest UTF-32 boundaries can be determined
* using <code>bounds()</code>.</p>
* <strong>Examples:</strong>
* <p>The following examples illustrate use of some of these methods.
* <pre>
* // iteration forwards: Original
* for (int i = 0; i &lt; s.length(); ++i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration forwards: Changes for UTF-32
* int ch;
* for (int i = 0; i &lt; s.length(); i+=UTF16.getCharCount(ch)) {
* ch = UTF16.charAt(s,i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Original
* for (int i = s.length() -1; i >= 0; --i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Changes for UTF-32
* int ch;
* for (int i = s.length() -1; i > 0; i-=UTF16.getCharCount(ch)) {
* ch = UTF16.charAt(s,i);
* doSomethingWith(ch);
* }
* </pre>
* <strong>Notes:</strong>
* <ul>
* <li>
* <strong>Naming:</strong> For clarity, High and Low surrogates are called
* <code>Lead</code> and <code>Trail</code> in the API, which gives a better
* sense of their ordering in a string. <code>offset16</code> and
* <code>offset32</code> are used to distinguish offsets to UTF-16
* boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
* used to contain UTF-32 characters, as opposed to <code>char16</code>,
* which is a UTF-16 code unit.
* </li>
* <li>
* <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
* UTF-32 offset to a UTF-16 offset and back. Because of the difference in
* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
* back if and only if <code>bounds(string, offset16) != TRAIL</code>.
* </li>
* <li>
* <strong>Exceptions:</strong> The error checking will throw an exception
* if indices are out of bounds. Other than than that, all methods will
* behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
* values are present. <code>UCharacter.isLegal()</code> can be used to check
* for validity if desired.
* </li>
* <li>
* <strong>Unmatched Surrogates:</strong> If the string contains unmatched
* surrogates, then these are counted as one UTF-32 value. This matches
* their iteration behavior, which is vital. It also matches common display
* practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
* </li>
* <li>
* <strong>Optimization:</strong> The method implementations may need
* optimization if the compiler doesn't fold static final methods. Since
* surrogate pairs will form an exceeding small percentage of all the text
* in the world, the singleton case should always be optimized for.
* </li>
* </ul>
* @author Mark Davis, with help from Markus Scherer
* @stable ICU 2.1
*/
public final class UTF16
{
// public variables ---------------------------------------------------
/**
* The lowest Unicode code point value.
* @stable ICU 2.1
*/
public static final int CODEPOINT_MIN_VALUE = 0;
/**
* The highest Unicode code point value (scalar value) according to the
* Unicode Standard.
* @stable ICU 2.1
*/
public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
/**
* The minimum value for Supplementary code points
* @stable ICU 2.1
*/
public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
/**
* Lead surrogate minimum value
* @stable ICU 2.1
*/
public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
/**
* Trail surrogate minimum value
* @stable ICU 2.1
*/
public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
/**
* Lead surrogate maximum value
* @stable ICU 2.1
*/
public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
/**
* Trail surrogate maximum value
* @stable ICU 2.1
*/
public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
/**
* Surrogate minimum value
* @stable ICU 2.1
*/
public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
// public method ------------------------------------------------------
/**
* Extract a single UTF-32 value from a string.
* Used when iterating forwards or backwards (with
* <code>UTF16.getCharCount()</code>, as well as random access. If a
* validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">
* UCharacter.isLegal()</a></code> on the return value.
* If the char retrieved is part of a surrogate pair, its supplementary
* character will be returned. If a complete supplementary character is
* not found the incomplete character will be returned
* @param source array of UTF-16 chars
* @param offset16 UTF-16 offset to the start of the character.
* @return UTF-32 value for the UTF-32 value that contains the char at
* offset16. The boundaries of that codepoint are the same as in
* <code>bounds32()</code>.
* @exception IndexOutOfBoundsException thrown if offset16 is out of
* bounds.
* @stable ICU 2.1
*/
public static int charAt(String source, int offset16) {
char single = source.charAt(offset16);
if (single < LEAD_SURROGATE_MIN_VALUE) {
return single;
}
return _charAt(source, offset16, single);
}
private static int _charAt(String source, int offset16, char single) {
if (single > TRAIL_SURROGATE_MAX_VALUE) {
return single;
}
// Convert the UTF-16 surrogate pair if necessary.
// For simplicity in usage, and because the frequency of pairs is
// low, look both directions.
if (single <= LEAD_SURROGATE_MAX_VALUE) {
++offset16;
if (source.length() != offset16) {
char trail = source.charAt(offset16);
if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(single, trail);
}
}
} else {
--offset16;
if (offset16 >= 0) {
// single is a trail surrogate so
char lead = source.charAt(offset16);
if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(lead, single);
}
}
}
return single; // return unmatched surrogate
}
/**
* Extract a single UTF-32 value from a substring.
* Used when iterating forwards or backwards (with
* <code>UTF16.getCharCount()</code>, as well as random access. If a
* validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
* </a></code> on the return value.
* If the char retrieved is part of a surrogate pair, its supplementary
* character will be returned. If a complete supplementary character is
* not found the incomplete character will be returned
* @param source array of UTF-16 chars
* @param start offset to substring in the source array for analyzing
* @param limit offset to substring in the source array for analyzing
* @param offset16 UTF-16 offset relative to start
* @return UTF-32 value for the UTF-32 value that contains the char at
* offset16. The boundaries of that codepoint are the same as in
* <code>bounds32()</code>.
* @exception IndexOutOfBoundsException thrown if offset16 is not within
* the range of start and limit.
* @stable ICU 2.1
*/
public static int charAt(char source[], int start, int limit,
int offset16)
{
offset16 += start;
if (offset16 < start || offset16 >= limit) {
throw new ArrayIndexOutOfBoundsException(offset16);
}
char single = source[offset16];
if (!isSurrogate(single)) {
return single;
}
// Convert the UTF-16 surrogate pair if necessary.
// For simplicity in usage, and because the frequency of pairs is
// low, look both directions.
if (single <= LEAD_SURROGATE_MAX_VALUE) {
offset16 ++;
if (offset16 >= limit) {
return single;
}
char trail = source[offset16];
if (isTrailSurrogate(trail)) {
return UCharacterProperty.getRawSupplementary(single, trail);
}
}
else { // isTrailSurrogate(single), so
if (offset16 == start) {
return single;
}
offset16 --;
char lead = source[offset16];
if (isLeadSurrogate(lead))
return UCharacterProperty.getRawSupplementary(lead, single);
}
return single; // return unmatched surrogate
}
/**
* Determines how many chars this char32 requires.
* If a validity check is required, use <code>
* <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
* char32 before calling.
* @param char32 the input codepoint.
* @return 2 if is in supplementary space, otherwise 1.
* @stable ICU 2.1
*/
public static int getCharCount(int char32)
{
if (char32 < SUPPLEMENTARY_MIN_VALUE) {
return 1;
}
return 2;
}
/**
* Determines whether the code value is a surrogate.
* @param char16 the input character.
* @return true iff the input character is a surrogate.
* @stable ICU 2.1
*/
public static boolean isSurrogate(char char16)
{
return LEAD_SURROGATE_MIN_VALUE <= char16 &&
char16 <= TRAIL_SURROGATE_MAX_VALUE;
}
/**
* Determines whether the character is a trail surrogate.
* @param char16 the input character.
* @return true iff the input character is a trail surrogate.
* @stable ICU 2.1
*/
public static boolean isTrailSurrogate(char char16)
{
return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
char16 <= TRAIL_SURROGATE_MAX_VALUE);
}
/**
* Determines whether the character is a lead surrogate.
* @param char16 the input character.
* @return true iff the input character is a lead surrogate
* @stable ICU 2.1
*/
public static boolean isLeadSurrogate(char char16)
{
return LEAD_SURROGATE_MIN_VALUE <= char16 &&
char16 <= LEAD_SURROGATE_MAX_VALUE;
}
/**
* Returns the lead surrogate.
* If a validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param char32 the input character.
* @return lead surrogate if the getCharCount(ch) is 2; <br>
* and 0 otherwise (note: 0 is not a valid lead surrogate).
* @stable ICU 2.1
*/
public static char getLeadSurrogate(int char32)
{
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
return (char)(LEAD_SURROGATE_OFFSET_ +
(char32 >> LEAD_SURROGATE_SHIFT_));
}
return 0;
}
/**
* Returns the trail surrogate.
* If a validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param char32 the input character.
* @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
* the character itself
* @stable ICU 2.1
*/
public static char getTrailSurrogate(int char32)
{
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
return (char)(TRAIL_SURROGATE_MIN_VALUE +
(char32 & TRAIL_SURROGATE_MASK_));
}
return (char)char32;
}
/**
* Convenience method corresponding to String.valueOf(char). Returns a one
* or two char string containing the UTF-32 value in UTF16 format. If a
* validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param char32 the input character.
* @return string value of char32 in UTF16 format
* @exception IllegalArgumentException thrown if char32 is a invalid
* codepoint.
* @stable ICU 2.1
*/
public static String valueOf(int char32)
{
if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
throw new IllegalArgumentException("Illegal codepoint");
}
return toString(char32);
}
/**
* Append a single UTF-32 value to the end of a StringBuffer.
* If a validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param target the buffer to append to
* @param char32 value to append.
* @return the updated StringBuffer
* @exception IllegalArgumentException thrown when char32 does not lie
* within the range of the Unicode codepoints
* @stable ICU 2.1
*/
public static StringBuffer append(StringBuffer target, int char32)
{
// Check for irregular values
if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
}
// Write the UTF-16 values
if (char32 >= SUPPLEMENTARY_MIN_VALUE)
{
target.append(getLeadSurrogate(char32));
target.append(getTrailSurrogate(char32));
}
else {
target.append((char)char32);
}
return target;
}
//// for StringPrep
/**
* Shifts offset16 by the argument number of codepoints within a subarray.
* @param source char array
* @param start position of the subarray to be performed on
* @param limit position of the subarray to be performed on
* @param offset16 UTF16 position to shift relative to start
* @param shift32 number of codepoints to shift
* @return new shifted offset16 relative to start
* @exception IndexOutOfBoundsException if the new offset16 is out of
* bounds with respect to the subarray or the subarray bounds
* are out of range.
* @stable ICU 2.1
*/
public static int moveCodePointOffset(char source[], int start, int limit,
int offset16, int shift32)
{
int size = source.length;
int count;
char ch;
int result = offset16 + start;
if (start<0 || limit<start) {
throw new StringIndexOutOfBoundsException(start);
}
if (limit>size) {
throw new StringIndexOutOfBoundsException(limit);
}
if (offset16<0 || result>limit) {
throw new StringIndexOutOfBoundsException(offset16);
}
if (shift32 > 0 ) {
if (shift32 + result > size) {
throw new StringIndexOutOfBoundsException(result);
}
count = shift32;
while (result < limit && count > 0)
{
ch = source[result];
if (isLeadSurrogate(ch) && (result+1 < limit) &&
isTrailSurrogate(source[result+1])) {
result ++;
}
count --;
result ++;
}
} else {
if (result + shift32 < start) {
throw new StringIndexOutOfBoundsException(result);
}
for (count=-shift32; count>0; count--) {
result--;
if (result<start) {
break;
}
ch = source[result];
if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) {
result--;
}
}
}
if (count != 0) {
throw new StringIndexOutOfBoundsException(shift32);
}
result -= start;
return result;
}
// private data members -------------------------------------------------
/**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
* Mask to retrieve the significant value from a trail surrogate.
*/
private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
/**
* Value that all lead surrogate starts with
*/
private static final int LEAD_SURROGATE_OFFSET_ =
LEAD_SURROGATE_MIN_VALUE -
(SUPPLEMENTARY_MIN_VALUE
>> LEAD_SURROGATE_SHIFT_);
// private methods ------------------------------------------------------
/**
* <p>Converts argument code point and returns a String object representing
* the code point's value in UTF16 format.</p>
* <p>This method does not check for the validity of the codepoint, the
* results are not guaranteed if a invalid codepoint is passed as
* argument.</p>
* <p>The result is a string whose length is 1 for non-supplementary code
* points, 2 otherwise.</p>
* @param ch code point
* @return string representation of the code point
*/
private static String toString(int ch)
{
if (ch < SUPPLEMENTARY_MIN_VALUE) {
return String.valueOf((char)ch);
}
StringBuffer result = new StringBuffer();
result.append(getLeadSurrogate(ch));
result.append(getTrailSurrogate(ch));
return result.toString();
}
}

View File

@@ -0,0 +1,58 @@
/*
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <code>UnicodeMatcher</code> defines a protocol for objects that can
* match a range of characters in a Replaceable string.
* @stable ICU 2.0
*/
public interface UnicodeMatcher {
/**
* The character at index i, where i < contextStart || i >= contextLimit,
* is ETHER. This allows explicit matching by rules and UnicodeSets
* of text outside the context. In traditional terms, this allows anchoring
* at the start and/or end.
* @stable ICU 2.0
*/
static final char ETHER = '\uFFFF';
}
//eof

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,219 @@
/*
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.util.Iterator;
/**
* UnicodeSetIterator iterates over the contents of a UnicodeSet. It
* iterates over either code points or code point ranges. After all
* code points or ranges have been returned, it returns the
* multicharacter strings of the UnicodSet, if any.
*
* <p>To iterate over code points, use a loop like this:
* <pre>
* UnicodeSetIterator it(set);
* while (set.next()) {
* if (set.codepoint != UnicodeSetIterator::IS_STRING) {
* processCodepoint(set.codepoint);
* } else {
* processString(set.string);
* }
* }
* </pre>
*
* <p>To iterate over code point ranges, use a loop like this:
* <pre>
* UnicodeSetIterator it(set);
* while (set.nextRange()) {
* if (set.codepoint != UnicodeSetIterator::IS_STRING) {
* processCodepointRange(set.codepoint, set.codepointEnd);
* } else {
* processString(set.string);
* }
* }
* </pre>
* @author M. Davis
* @stable ICU 2.0
*/
public class UnicodeSetIterator {
/**
* Value of <tt>codepoint</tt> if the iterator points to a string.
* If <tt>codepoint == IS_STRING</tt>, then examine
* <tt>string</tt> for the current iteration result.
* @stable ICU 2.0
*/
public static int IS_STRING = -1;
/**
* Current code point, or the special value <tt>IS_STRING</tt>, if
* the iterator points to a string.
* @stable ICU 2.0
*/
public int codepoint;
/**
* When iterating over ranges using <tt>nextRange()</tt>,
* <tt>codepointEnd</tt> contains the inclusive end of the
* iteration range, if <tt>codepoint != IS_STRING</tt>. If
* iterating over code points using <tt>next()</tt>, or if
* <tt>codepoint == IS_STRING</tt>, then the value of
* <tt>codepointEnd</tt> is undefined.
* @stable ICU 2.0
*/
public int codepointEnd;
/**
* If <tt>codepoint == IS_STRING</tt>, then <tt>string</tt> points
* to the current string. If <tt>codepoint != IS_STRING</tt>, the
* value of <tt>string</tt> is undefined.
* @stable ICU 2.0
*/
public String string;
/**
* Create an iterator over the given set.
* @param set set to iterate over
* @stable ICU 2.0
*/
public UnicodeSetIterator(UnicodeSet set) {
reset(set);
}
/**
* Returns the next element in the set, either a code point range
* or a string. If there are no more elements in the set, return
* false. If <tt>codepoint == IS_STRING</tt>, the value is a
* string in the <tt>string</tt> field. Otherwise the value is a
* range of one or more code points from <tt>codepoint</tt> to
* <tt>codepointeEnd</tt> inclusive.
*
* <p>The order of iteration is all code points ranges in sorted
* order, followed by all strings sorted order. Ranges are
* disjoint and non-contiguous. <tt>string</tt> is undefined
* unless <tt>codepoint == IS_STRING</tt>. Do not mix calls to
* <tt>next()</tt> and <tt>nextRange()</tt> without calling
* <tt>reset()</tt> between them. The results of doing so are
* undefined.
*
* @return true if there was another element in the set and this
* object contains the element.
* @stable ICU 2.0
*/
public boolean nextRange() {
if (nextElement <= endElement) {
codepointEnd = endElement;
codepoint = nextElement;
nextElement = endElement+1;
return true;
}
if (range < endRange) {
loadRange(++range);
codepointEnd = endElement;
codepoint = nextElement;
nextElement = endElement+1;
return true;
}
// stringIterator == null iff there are no string elements remaining
if (stringIterator == null) return false;
codepoint = IS_STRING; // signal that value is actually a string
string = stringIterator.next();
if (!stringIterator.hasNext()) stringIterator = null;
return true;
}
/**
* Sets this iterator to visit the elements of the given set and
* resets it to the start of that set. The iterator is valid only
* so long as <tt>set</tt> is valid.
* @param set the set to iterate over.
* @stable ICU 2.0
*/
public void reset(UnicodeSet uset) {
set = uset;
reset();
}
/**
* Resets this iterator to the start of the set.
* @stable ICU 2.0
*/
public void reset() {
endRange = set.getRangeCount() - 1;
range = 0;
endElement = -1;
nextElement = 0;
if (endRange >= 0) {
loadRange(range);
}
stringIterator = null;
if (set.strings != null) {
stringIterator = set.strings.iterator();
if (!stringIterator.hasNext()) stringIterator = null;
}
}
// ======================= PRIVATES ===========================
private UnicodeSet set;
private int endRange = 0;
private int range = 0;
/**
* @internal
*/
protected int endElement;
/**
* @internal
*/
protected int nextElement;
private Iterator<String> stringIterator = null;
/**
* Invariant: stringIterator is null when there are no (more) strings remaining
*/
/**
* @internal
*/
protected void loadRange(int aRange) {
nextElement = set.getRangeStart(aRange);
endElement = set.getRangeEnd(aRange);
}
}

View File

@@ -0,0 +1,385 @@
/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
public final class Utility {
/**
* Convenience utility to compare two Object[]s
* Ought to be in System.
* @param len the length to compare.
* The start indices and start+len must be valid.
*/
public final static boolean arrayRegionMatches(char[] source, int sourceStart,
char[] target, int targetStart,
int len)
{
int sourceEnd = sourceStart + len;
int delta = targetStart - sourceStart;
for (int i = sourceStart; i < sourceEnd; i++) {
if (source[i]!=target[i + delta])
return false;
}
return true;
}
/**
* Convert characters outside the range U+0020 to U+007F to
* Unicode escapes, and convert backslash to a double backslash.
*/
public static final String escape(String s) {
StringBuffer buf = new StringBuffer();
for (int i=0; i<s.length(); ) {
int c = UTF16.charAt(s, i);
i += UTF16.getCharCount(c);
if (c >= ' ' && c <= 0x007F) {
if (c == '\\') {
buf.append("\\\\"); // That is, "\\"
} else {
buf.append((char)c);
}
} else {
boolean four = c <= 0xFFFF;
buf.append(four ? "\\u" : "\\U");
hex(c, four ? 4 : 8, buf);
}
}
return buf.toString();
}
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
static private final char[] UNESCAPE_MAP = {
/*" 0x22, 0x22 */
/*' 0x27, 0x27 */
/*? 0x3F, 0x3F */
/*\ 0x5C, 0x5C */
/*a*/ 0x61, 0x07,
/*b*/ 0x62, 0x08,
/*e*/ 0x65, 0x1b,
/*f*/ 0x66, 0x0c,
/*n*/ 0x6E, 0x0a,
/*r*/ 0x72, 0x0d,
/*t*/ 0x74, 0x09,
/*v*/ 0x76, 0x0b
};
/**
* Convert an escape to a 32-bit code point value. We attempt
* to parallel the icu4c unescapeAt() function.
* @param offset16 an array containing offset to the character
* <em>after</em> the backslash. Upon return offset16[0] will
* be updated to point after the escape sequence.
* @return character value from 0 to 10FFFF, or -1 on error.
*/
public static int unescapeAt(String s, int[] offset16) {
int c;
int result = 0;
int n = 0;
int minDig = 0;
int maxDig = 0;
int bitsPerDigit = 4;
int dig;
int i;
boolean braces = false;
/* Check that offset is in range */
int offset = offset16[0];
int length = s.length();
if (offset < 0 || offset >= length) {
return -1;
}
/* Fetch first UChar after '\\' */
c = UTF16.charAt(s, offset);
offset += UTF16.getCharCount(c);
/* Convert hexadecimal and octal escapes */
switch (c) {
case 'u':
minDig = maxDig = 4;
break;
case 'U':
minDig = maxDig = 8;
break;
case 'x':
minDig = 1;
if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
++offset;
braces = true;
maxDig = 8;
} else {
maxDig = 2;
}
break;
default:
dig = UCharacter.digit(c, 8);
if (dig >= 0) {
minDig = 1;
maxDig = 3;
n = 1; /* Already have first octal digit */
bitsPerDigit = 3;
result = dig;
}
break;
}
if (minDig != 0) {
while (offset < length && n < maxDig) {
c = UTF16.charAt(s, offset);
dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
if (dig < 0) {
break;
}
result = (result << bitsPerDigit) | dig;
offset += UTF16.getCharCount(c);
++n;
}
if (n < minDig) {
return -1;
}
if (braces) {
if (c != 0x7D /*}*/) {
return -1;
}
++offset;
}
if (result < 0 || result >= 0x110000) {
return -1;
}
// If an escape sequence specifies a lead surrogate, see
// if there is a trail surrogate after it, either as an
// escape or as a literal. If so, join them up into a
// supplementary.
if (offset < length &&
UTF16.isLeadSurrogate((char) result)) {
int ahead = offset+1;
c = s.charAt(offset); // [sic] get 16-bit code unit
if (c == '\\' && ahead < length) {
int o[] = new int[] { ahead };
c = unescapeAt(s, o);
ahead = o[0];
}
if (UTF16.isTrailSurrogate((char) c)) {
offset = ahead;
result = UCharacterProperty.getRawSupplementary(
(char) result, (char) c);
}
}
offset16[0] = offset;
return result;
}
/* Convert C-style escapes in table */
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
if (c == UNESCAPE_MAP[i]) {
offset16[0] = offset;
return UNESCAPE_MAP[i+1];
} else if (c < UNESCAPE_MAP[i]) {
break;
}
}
/* Map \cX to control-X: X & 0x1F */
if (c == 'c' && offset < length) {
c = UTF16.charAt(s, offset);
offset16[0] = offset + UTF16.getCharCount(c);
return 0x1F & c;
}
/* If no special forms are recognized, then consider
* the backslash to generically escape the next character. */
offset16[0] = offset;
return c;
}
/**
* Convert a integer to size width hex uppercase digits.
* E.g., hex('a', 4, str) => "0041".
* Append the output to the given StringBuffer.
* If width is too small to fit, nothing will be appended to output.
*/
public static StringBuffer hex(int ch, int width, StringBuffer output) {
return appendNumber(output, ch, 16, width);
}
/**
* Convert a integer to size width (minimum) hex uppercase digits.
* E.g., hex('a', 4, str) => "0041". If the integer requires more
* than width digits, more will be used.
*/
public static String hex(int ch, int width) {
StringBuffer buf = new StringBuffer();
return appendNumber(buf, ch, 16, width).toString();
}
/**
* Skip over a sequence of zero or more white space characters
* at pos. Return the index of the first non-white-space character
* at or after pos, or str.length(), if there is none.
*/
public static int skipWhitespace(String str, int pos) {
while (pos < str.length()) {
int c = UTF16.charAt(str, pos);
if (!UCharacterProperty.isRuleWhiteSpace(c)) {
break;
}
pos += UTF16.getCharCount(c);
}
return pos;
}
static final char DIGITS[] = {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
'U', 'V', 'W', 'X', 'Y', 'Z'
};
/**
* Append the digits of a positive integer to the given
* <code>StringBuffer</code> in the given radix. This is
* done recursively since it is easiest to generate the low-
* order digit first, but it must be appended last.
*
* @param result is the <code>StringBuffer</code> to append to
* @param n is the positive integer
* @param radix is the radix, from 2 to 36 inclusive
* @param minDigits is the minimum number of digits to append.
*/
private static void recursiveAppendNumber(StringBuffer result, int n,
int radix, int minDigits)
{
int digit = n % radix;
if (n >= radix || minDigits > 1) {
recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
}
result.append(DIGITS[digit]);
}
/**
* Append a number to the given StringBuffer in the given radix.
* Standard digits '0'-'9' are used and letters 'A'-'Z' for
* radices 11 through 36.
* @param result the digits of the number are appended here
* @param n the number to be converted to digits; may be negative.
* If negative, a '-' is prepended to the digits.
* @param radix a radix from 2 to 36 inclusive.
* @param minDigits the minimum number of digits, not including
* any '-', to produce. Values less than 2 have no effect. One
* digit is always emitted regardless of this parameter.
* @return a reference to result
*/
public static StringBuffer appendNumber(StringBuffer result, int n,
int radix, int minDigits)
throws IllegalArgumentException
{
if (radix < 2 || radix > 36) {
throw new IllegalArgumentException("Illegal radix " + radix);
}
int abs = n;
if (n < 0) {
abs = -n;
result.append("-");
}
recursiveAppendNumber(result, abs, radix, minDigits);
return result;
}
/**
* Return true if the character is NOT printable ASCII. The tab,
* newline and linefeed characters are considered unprintable.
*/
public static boolean isUnprintable(int c) {
return !(c >= 0x20 && c <= 0x7E);
}
/**
* Escape unprintable characters using <backslash>uxxxx notation
* for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
* above. If the character is printable ASCII, then do nothing
* and return FALSE. Otherwise, append the escaped notation and
* return TRUE.
*/
public static boolean escapeUnprintable(StringBuffer result, int c) {
if (isUnprintable(c)) {
result.append('\\');
if ((c & ~0xFFFF) != 0) {
result.append('U');
result.append(DIGITS[0xF&(c>>28)]);
result.append(DIGITS[0xF&(c>>24)]);
result.append(DIGITS[0xF&(c>>20)]);
result.append(DIGITS[0xF&(c>>16)]);
} else {
result.append('u');
}
result.append(DIGITS[0xF&(c>>12)]);
result.append(DIGITS[0xF&(c>>8)]);
result.append(DIGITS[0xF&(c>>4)]);
result.append(DIGITS[0xF&c]);
return true;
}
return false;
}
/**
* Similar to StringBuffer.getChars, version 1.3.
* Since JDK 1.2 implements StringBuffer.getChars differently, this method
* is here to provide consistent results.
* To be removed after JDK 1.2 ceased to be the reference platform.
* @param src source string buffer
* @param srcBegin offset to the start of the src to retrieve from
* @param srcEnd offset to the end of the src to retrieve from
* @param dst char array to store the retrieved chars
* @param dstBegin offset to the start of the destination char array to
* store the retrieved chars
*/
public static void getChars(StringBuffer src, int srcBegin, int srcEnd,
char dst[], int dstBegin)
{
if (srcBegin == srcEnd) {
return;
}
src.getChars(srcBegin, srcEnd, dst, dstBegin);
}
}

View File

@@ -0,0 +1,185 @@
/*
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.util.HashMap;
/**
* Class to store version numbers of the form major.minor.milli.micro.
* @author synwee
* @stable ICU 2.6
*/
public final class VersionInfo
{
// public methods ------------------------------------------------------
/**
* Returns an instance of VersionInfo with the argument version.
* @param version version String in the format of "major.minor.milli.micro"
* or "major.minor.milli" or "major.minor" or "major",
* where major, minor, milli, micro are non-negative numbers
* <= 255. If the trailing version numbers are
* not specified they are taken as 0s. E.g. Version "3.1" is
* equivalent to "3.1.0.0".
* @return an instance of VersionInfo with the argument version.
* @exception throws an IllegalArgumentException when the argument version
* is not in the right format
* @stable ICU 2.6
*/
public static VersionInfo getInstance(String version)
{
int length = version.length();
int array[] = {0, 0, 0, 0};
int count = 0;
int index = 0;
while (count < 4 && index < length) {
char c = version.charAt(index);
if (c == '.') {
count ++;
}
else {
c -= '0';
if (c < 0 || c > 9) {
throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
}
array[count] *= 10;
array[count] += c;
}
index ++;
}
if (index != length) {
throw new IllegalArgumentException(
"Invalid version number: String '" + version + "' exceeds version format");
}
for (int i = 0; i < 4; i ++) {
if (array[i] < 0 || array[i] > 255) {
throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
}
}
return getInstance(array[0], array[1], array[2], array[3]);
}
/**
* Returns an instance of VersionInfo with the argument version.
* @param major major version, non-negative number <= 255.
* @param minor minor version, non-negative number <= 255.
* @param milli milli version, non-negative number <= 255.
* @param micro micro version, non-negative number <= 255.
* @exception throws an IllegalArgumentException when either arguments are
* negative or > 255
* @stable ICU 2.6
*/
public static VersionInfo getInstance(int major, int minor, int milli,
int micro)
{
// checks if it is in the hashmap
// else
if (major < 0 || major > 255 || minor < 0 || minor > 255 ||
milli < 0 || milli > 255 || micro < 0 || micro > 255) {
throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
}
int version = getInt(major, minor, milli, micro);
Integer key = Integer.valueOf(version);
Object result = MAP_.get(key);
if (result == null) {
result = new VersionInfo(version);
MAP_.put(key, result);
}
return (VersionInfo)result;
}
/**
* Compares other with this VersionInfo.
* @param other VersionInfo to be compared
* @return 0 if the argument is a VersionInfo object that has version
* information equals to this object.
* Less than 0 if the argument is a VersionInfo object that has
* version information greater than this object.
* Greater than 0 if the argument is a VersionInfo object that
* has version information less than this object.
* @stable ICU 2.6
*/
public int compareTo(VersionInfo other)
{
return m_version_ - other.m_version_;
}
// private data members ----------------------------------------------
/**
* Version number stored as a byte for each of the major, minor, milli and
* micro numbers in the 32 bit int.
* Most significant for the major and the least significant contains the
* micro numbers.
*/
private int m_version_;
/**
* Map of singletons
*/
private static final HashMap<Integer, Object> MAP_ = new HashMap<>();
/**
* Error statement string
*/
private static final String INVALID_VERSION_NUMBER_ =
"Invalid version number: Version number may be negative or greater than 255";
// private constructor -----------------------------------------------
/**
* Constructor with int
* @param compactversion a 32 bit int with each byte representing a number
*/
private VersionInfo(int compactversion)
{
m_version_ = compactversion;
}
/**
* Gets the int from the version numbers
* @param major non-negative version number
* @param minor non-negativeversion number
* @param milli non-negativeversion number
* @param micro non-negativeversion number
*/
private static int getInt(int major, int minor, int milli, int micro)
{
return (major << 24) | (minor << 16) | (milli << 8) | micro;
}
}