/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.xerces.impl.io.UTF16Reader;
import org.apache.xerces.util.XMLChar;

/**
 * This program tests the customized UTF-16 reader for the parser,
 * comparing it with the Java UTF-16 reader.
 * <p>
 * Both readers are fed the same generated byte stream (see
 * {@link UTF16Producer}) and are checked once reading a single character
 * at a time and once reading in blocks. Results are written to
 * <code>System.err</code>.
 *
 * @version $Id$
 */
public class UTF16 {

    //
    // MAIN
    //

    /**
     * Main program entry. Runs the decoder tests for big-endian input and
     * then for little-endian input.
     *
     * @param argv Command line arguments (unused).
     *
     * @throws Exception Propagated from {@link #testUTF16Decoder(boolean)}.
     */
    public static void main(String[] argv) throws Exception {
        testUTF16Decoder(true);
        testUTF16Decoder(false);
    } // main(String[])

    //
    // Public static methods
    //

    /**
     * Runs the character-by-character and the block-read test against the
     * Java reference decoder and then against the custom
     * <code>UTF16Reader</code>, for the given byte order.
     * <p>
     * An <code>IOException</code> raised by a single test is reported as
     * "FAIL" on <code>System.err</code> and the remaining tests still run.
     *
     * @param isBigEndian True to test big-endian UTF-16, false to test
     *                    little-endian UTF-16.
     *
     * @throws Exception Any non-I/O exception raised by a test.
     */
    public static void testUTF16Decoder(boolean isBigEndian) throws Exception {

        final int BLOCK_READ_SIZE = 2048;
        final String shortName = isBigEndian ? "BE" : "LE";

        //
        // Test Java reference implementation of UTF-16 decoder
        //

        System.err.println("#");
        System.err.println("# Testing Java UTF-16" + shortName + " decoder");
        System.err.println("#");

        runCharByCharTest(false, isBigEndian);
        runCharArrayTest(false, isBigEndian, BLOCK_READ_SIZE);

        //
        // Test custom implementation of UTF-16 decoder
        //

        System.err.println("#");
        System.err.println("# Testing custom UTF-16" + shortName + " decoder");
        System.err.println("#");

        runCharByCharTest(true, isBigEndian);
        runCharArrayTest(true, isBigEndian, BLOCK_READ_SIZE);

    } // testUTF16Decoder(boolean)

    /**
     * This function tests the specified reader character by character.
     * <p>
     * The reader is expected to deliver U+0000..U+D7FF, then
     * U+E000..U+FFFD, then the surrogate pairs for U+10000..U+10FFFF,
     * followed by end of stream. Mismatches are reported through the
     * <code>UTF8</code> test helper.
     *
     * @param reader The reader to test.
     *
     * @return The elapsed time in milliseconds.
     *
     * @throws Exception Raised by the reader or by the reporting helpers.
     */
    public static long testCharByChar(Reader reader) throws Exception {

        long before = System.currentTimeMillis();
        System.err.println("# Testing character by character");

        System.err.println("testing 0x000000 -> 0x00D7FF");
        for (int i = 0; i < 0xD800; i++) {
            int c = reader.read();
            if (c != i) {
                UTF8.expectedChar(null, i, c);
            }
        }

        System.err.println("testing 0x00E000 -> 0x00FFFD");
        for (int i = 0xE000; i < 0xFFFE; i++) {
            int c = reader.read();
            if (c != i) {
                UTF8.expectedChar(null, i, c);
            }
        }

        System.err.println("testing 0x010000 -> 0x10FFFF");
        for (int i = 0x10000; i < 0x110000; i++) {
            int hs = expectedHighSurrogate(i);
            int ls = expectedLowSurrogate(i);
            // high surrogate
            int c = reader.read();
            if (c != hs) {
                UTF8.expectedChar("high surrogate", hs, c);
            }
            // low surrogate
            c = reader.read();
            if (c != ls) {
                UTF8.expectedChar("low surrogate", ls, c);
            }
        }

        System.err.println("checking EOF");
        int c = reader.read();
        if (c != -1) {
            UTF8.extraChar(c);
        }

        long after = System.currentTimeMillis();
        return after - before;

    } // testCharByChar(Reader):long

    /**
     * This function tests the given reader by performing block character
     * reads of the specified size.
     * <p>
     * The expected character sequence is the same as for
     * {@link #testCharByChar(Reader)}. The buffer is refilled through
     * <code>UTF8.load</code> each time it has been fully consumed.
     *
     * @param reader The reader to test.
     * @param size   The size of the character buffer used for block reads.
     *
     * @return The elapsed time in milliseconds.
     *
     * @throws Exception Raised by the reader or by the reporting helpers.
     */
    public static long testCharArray(Reader reader, int size) throws Exception {

        long before = System.currentTimeMillis();
        System.err.println("# Testing character array of size "+size);

        char[] ch = new char[size];
        int count = 0;
        int position = 0;

        System.err.println("testing 0x000000 -> 0x00D7FF");
        for (int i = 0; i < 0xD800; i++) {
            if (position == count) {
                count = UTF8.load(reader, ch);
                position = 0;
            }
            int c = ch[position++];
            if (c != i) {
                UTF8.expectedChar(null, i, c);
            }
        }

        System.err.println("testing 0x00E000 -> 0x00FFFD");
        for (int i = 0xE000; i < 0xFFFE; i++) {
            if (position == count) {
                count = UTF8.load(reader, ch);
                position = 0;
            }
            int c = ch[position++];
            if (c != i) {
                UTF8.expectedChar(null, i, c);
            }
        }

        // The label uses the inclusive upper bound, like every other
        // progress label printed by these tests.
        System.err.println("testing 0x010000 -> 0x10FFFF");
        for (int i = 0x10000; i < 0x110000; i++) {
            int hs = expectedHighSurrogate(i);
            int ls = expectedLowSurrogate(i);
            // high surrogate
            if (position == count) {
                count = UTF8.load(reader, ch);
                position = 0;
            }
            int c = ch[position++];
            if (c != hs) {
                UTF8.expectedChar("high surrogate", hs, c);
            }
            // low surrogate
            if (position == count) {
                count = UTF8.load(reader, ch);
                position = 0;
            }
            c = ch[position++];
            if (c != ls) {
                UTF8.expectedChar("low surrogate", ls, c);
            }
        }

        System.err.println("checking EOF");
        if (position == count) {
            count = UTF8.load(reader, ch);
            position = 0;
        }
        // Anything other than -1 here means characters are left over.
        if (count != -1) {
            UTF8.extraChar(ch[position]);
        }

        long after = System.currentTimeMillis();
        return after - before;

    } // testCharArray(Reader):long

    //
    // Private static methods
    //

    /**
     * Creates a reader over a fresh {@link UTF16Producer} stream, using
     * either the custom <code>UTF16Reader</code> or the Java
     * <code>InputStreamReader</code>.
     */
    private static Reader createReader(boolean useCustomReader, boolean isBigEndian)
        throws IOException {
        InputStream stream = new UTF16Producer(isBigEndian);
        if (useCustomReader) {
            return new UTF16Reader(stream, isBigEndian);
        }
        String encoding = isBigEndian ? "UnicodeBig" : "UnicodeLittle";
        return new InputStreamReader(stream, encoding);
    } // createReader(boolean,boolean):Reader

    /**
     * Runs the character-by-character test, printing PASS or FAIL. The
     * reader is closed even when the test throws.
     */
    private static void runCharByCharTest(boolean useCustomReader, boolean isBigEndian)
        throws Exception {
        try {
            Reader reader = createReader(useCustomReader, isBigEndian);
            try {
                long time = testCharByChar(reader);
                System.err.println("PASS ("+time+" ms)");
            }
            finally {
                reader.close();
            }
        }
        catch (IOException e) {
            System.err.println("FAIL: "+e.getMessage());
        }
    } // runCharByCharTest(boolean,boolean)

    /**
     * Runs the block-read test, printing PASS or FAIL. The reader is
     * closed even when the test throws.
     */
    private static void runCharArrayTest(boolean useCustomReader, boolean isBigEndian,
                                         int blockSize) throws Exception {
        try {
            Reader reader = createReader(useCustomReader, isBigEndian);
            try {
                long time = testCharArray(reader, blockSize);
                System.err.println("PASS ("+time+" ms)");
            }
            finally {
                reader.close();
            }
        }
        catch (IOException e) {
            System.err.println("FAIL: "+e.getMessage());
        }
    } // runCharArrayTest(boolean,boolean,int)

    /**
     * Computes the expected high surrogate for a supplementary code point
     * directly from its bit layout (uuuuu zzzz yyyyyy xxxxxx), without
     * relying on the <code>XMLChar</code> helpers used by the producer.
     */
    private static int expectedHighSurrogate(int codePoint) {
        int uuuuu = (codePoint >> 16) & 0x001F;
        int wwww = uuuuu - 1;
        int zzzz = (codePoint >> 12) & 0x000F;
        int yyyyyy = (codePoint >> 6) & 0x003F;
        return 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4);
    } // expectedHighSurrogate(int):int

    /**
     * Computes the expected low surrogate for a supplementary code point
     * directly from its bit layout (uuuuu zzzz yyyyyy xxxxxx).
     */
    private static int expectedLowSurrogate(int codePoint) {
        int yyyyyy = (codePoint >> 6) & 0x003F;
        int xxxxxx = codePoint & 0x003F;
        return 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
    } // expectedLowSurrogate(int):int

    //
    // Classes
    //

    /**
     * This class produces a stream of UTF-16 byte sequences for all
     * valid Unicode characters.
     * <p>
     * The stream covers U+0000..U+D7FF and U+E000..U+FFFD as two bytes
     * each, then U+10000..U+10FFFF as four bytes each (a surrogate pair),
     * and then signals end of stream.
     */
    public static class UTF16Producer extends InputStream {

        //
        // Data
        //

        /** The current code point. */
        private int fCodePoint;

        /** The current byte of the current code point. */
        private int fByte;

        /** Endianness. */
        private final boolean fIsBigEndian;

        //
        // Constructors
        //

        /**
         * Constructs a producer for the given byte order.
         *
         * @param isBigEndian True to emit each 16-bit code unit high-order
         *                    byte first, false to emit low-order byte first.
         */
        public UTF16Producer(boolean isBigEndian) {
            fIsBigEndian = isBigEndian;
        }

        //
        // InputStream methods
        //

        /**
         * Reads the next byte of the generated UTF-16 stream.
         *
         * @return The next byte (0-255), or -1 once every code point has
         *         been emitted.
         */
        public int read() throws IOException {

            // Basic Multilingual Plane: one 16-bit code unit, two bytes.
            if (fCodePoint < 0xFFFE) {
                // skip surrogate blocks
                if (fCodePoint == 0xD800) {
                    fCodePoint = 0xE000;
                }
                switch (fByte) {
                    case 0: {
                        fByte++;
                        return selectByte(fCodePoint, true);
                    }
                    case 1: {
                        final int b = selectByte(fCodePoint, false);
                        fCodePoint++;
                        fByte = 0;
                        return b;
                    }
                    default: {
                        throw new RuntimeException("byte "+fByte+" of 2 byte UTF-16 sequence");
                    }
                }
            }

            // U+FFFE and U+FFFF are not emitted; continue with the
            // supplementary planes.
            if (fCodePoint == 0xFFFE) {
                fCodePoint = 0x10000;
            }

            // Supplementary planes: a surrogate pair, four bytes.
            if (fCodePoint < 0x110000) {
                switch (fByte) {
                    case 0: {
                        fByte++;
                        return selectByte(XMLChar.highSurrogate(fCodePoint), true);
                    }
                    case 1: {
                        fByte++;
                        return selectByte(XMLChar.highSurrogate(fCodePoint), false);
                    }
                    case 2: {
                        fByte++;
                        return selectByte(XMLChar.lowSurrogate(fCodePoint), true);
                    }
                    case 3: {
                        final int b = selectByte(XMLChar.lowSurrogate(fCodePoint), false);
                        fCodePoint++;
                        fByte = 0;
                        return b;
                    }
                    default: {
                        throw new RuntimeException("byte "+fByte+" of 4 byte UTF-16 sequence");
                    }
                }
            }

            return -1;

        } // read():int

        //
        // Private methods
        //

        /**
         * Returns the first or second byte of a 16-bit code unit as it
         * appears on the wire for this producer's byte order.
         */
        private int selectByte(int codeUnit, boolean firstByte) {
            // Big-endian sends the high-order byte first; little-endian
            // sends the low-order byte first.
            final boolean highOrder = (firstByte == fIsBigEndian);
            return highOrder ? (codeUnit >> 8) : (codeUnit & 0xff);
        } // selectByte(int,boolean):int

    } // class UTF16Producer

} // class UTF16