/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.xerces.impl.io.UTF8Reader;

/**
 * This program tests the customized UTF-8 reader for the parser,
 * comparing it with the Java UTF-8 reader. Interestingly, when
 * reading character by character (as opposed to block character
 * reads), the Java reader silently skips surrogate characters
 * on the input! I've seen this behavior in 1.1.8, 1.2, and 1.3
 * under the Windows platform.
 * <p>
 * Each test feeds a decoder with the UTF-8 encoding of every Unicode
 * code point except the surrogate block (see {@link UTF8Producer}) and
 * verifies that the decoder returns exactly the expected sequence of
 * UTF-16 code units followed by end of stream.
 *
 * @author Andy Clark, IBM
 *
 * @version $Id$
 */
public class UTF8 {

    //
    // Constants
    //

    /** Block size value that selects the character-by-character test. */
    private static final int CHAR_BY_CHAR = 0;

    //
    // MAIN
    //

    /**
     * Main program entry. Runs the character-by-character test and the
     * block-read test, first against the Java UTF-8 decoder and then
     * against the custom decoder. Results are written to standard error.
     *
     * @param argv Command line arguments (not used).
     *
     * @throws Exception Thrown if a test fails with anything other than
     *                   an <code>IOException</code>; I/O failures are
     *                   reported as "FAIL" lines instead.
     */
    public static void main(String[] argv) throws Exception {

        final int BLOCK_READ_SIZE = 2048;

        //
        // Test Java reference implementation of UTF-8 decoder
        //

        System.err.println("#");
        System.err.println("# Testing Java UTF-8 decoder");
        System.err.println("#");
        runTest(false, CHAR_BY_CHAR);
        runTest(false, BLOCK_READ_SIZE);

        //
        // Test custom implementation of UTF-8 decoder
        //

        System.err.println("#");
        System.err.println("# Testing custom UTF-8 decoder");
        System.err.println("#");
        runTest(true, CHAR_BY_CHAR);
        runTest(true, BLOCK_READ_SIZE);

    } // main(String[])

    //
    // Public static methods
    //

    /**
     * This function tests the specified reader character by character.
     *
     * @param reader The reader to test. It is expected to decode the
     *               byte stream generated by {@link UTF8Producer}.
     *
     * @return The elapsed time of the test in milliseconds.
     *
     * @throws Exception An <code>IOException</code> is thrown on the
     *                   first unexpected character, on premature end of
     *                   stream, or if there is trailing data.
     */
    public static long testCharByChar(Reader reader) throws Exception {

        long before = System.currentTimeMillis();
        System.err.println("# Testing character by character");

        checkAllCharacters(new SingleCharSource(reader));

        long after = System.currentTimeMillis();
        return after - before;

    } // testCharByChar(Reader):long

    /**
     * This function tests the given reader by performing block character
     * reads of the specified size.
     *
     * @param reader The reader to test. It is expected to decode the
     *               byte stream generated by {@link UTF8Producer}.
     * @param size   The size of the character buffer passed to each
     *               block read. Must be at least 1.
     *
     * @return The elapsed time of the test in milliseconds.
     *
     * @throws IllegalArgumentException Thrown if <code>size</code> is
     *                   less than 1.
     * @throws Exception An <code>IOException</code> is thrown on the
     *                   first unexpected character, on premature end of
     *                   stream, or if there is trailing data.
     */
    public static long testCharArray(Reader reader, int size) throws Exception {

        // an empty buffer can never make progress
        if (size < 1) {
            throw new IllegalArgumentException("block size must be at least 1: "+size);
        }

        long before = System.currentTimeMillis();
        System.err.println("# Testing character array of size "+size);

        checkAllCharacters(new BlockCharSource(reader, size));

        long after = System.currentTimeMillis();
        return after - before;

    } // testCharArray(Reader,int):long

    //
    // Package private static methods
    //

    /**
     * Loads another block of characters from the reader.
     *
     * @param reader The reader to read from.
     * @param ch     The buffer to fill, starting at offset zero.
     *
     * @return The value returned by the reader's block read: the number
     *         of characters stored, or -1 at end of stream.
     *
     * @throws IOException Thrown by the underlying reader.
     */
    static int load(Reader reader, char[] ch) throws IOException {
        return reader.read(ch, 0, ch.length);
    } // load(Reader,char[]):int

    /**
     * Creates an I/O exception for expected character. This method
     * always throws.
     *
     * @param prefix Optional description of the expected character
     *               (e.g. "high surrogate"); may be null.
     * @param ec     The expected character.
     * @param fc     The character found, or -1 for end of stream.
     *
     * @throws IOException Always thrown, describing the mismatch.
     */
    static void expectedChar(String prefix, int ec, int fc) throws IOException {

        String message = "expected ";
        if (prefix != null) {
            message += prefix + ' ';
        }
        message += "0x" + Integer.toHexString(ec) + " but found ";
        // only real characters get the hex prefix; EOF is not a number
        if (fc != -1) {
            message += "0x" + Integer.toHexString(fc);
        }
        else {
            message += "EOF";
        }
        throw new IOException(message);

    } // expectedChar(String,int,int)

    /**
     * Creates an I/O exception for extra character. This method
     * always throws.
     *
     * @param c The unexpected trailing character.
     *
     * @throws IOException Always thrown, describing the extra character.
     */
    static void extraChar(int c) throws IOException {
        throw new IOException("found extra character 0x"+Integer.toHexString(c));
    } // extraChar(int)

    //
    // Private static methods
    //

    /**
     * Runs a single test against a freshly created producer and decoder
     * and prints "PASS" with the elapsed time or "FAIL" with the reason.
     * The reader is closed whether or not the test succeeds.
     *
     * @param useCustomDecoder True to test the custom
     *                         <code>UTF8Reader</code>; false to test
     *                         the Java <code>InputStreamReader</code>.
     * @param blockSize        The block read size, or
     *                         {@link #CHAR_BY_CHAR} to read one
     *                         character at a time.
     *
     * @throws Exception Thrown if the test fails with anything other
     *                   than an <code>IOException</code>.
     */
    private static void runTest(boolean useCustomDecoder, int blockSize)
        throws Exception {

        Reader reader = null;
        try {
            InputStream stream = new UTF8Producer();
            if (useCustomDecoder) {
                reader = new UTF8Reader(stream);
            }
            else {
                reader = new InputStreamReader(stream, "UTF8");
            }
            long time = blockSize == CHAR_BY_CHAR
                      ? testCharByChar(reader)
                      : testCharArray(reader, blockSize);
            System.err.println("PASS ("+time+" ms)");
        }
        catch (IOException e) {
            System.err.println("FAIL: "+e.getMessage());
        }
        finally {
            // close on every path so a failed test does not leak the reader
            if (reader != null) {
                try {
                    reader.close();
                }
                catch (IOException e) {
                    System.err.println("FAIL: "+e.getMessage());
                }
            }
        }

    } // runTest(boolean,int)

    /**
     * Verifies that the character source yields every expected UTF-16
     * code unit, in order, followed by end of stream.
     *
     * @param in The source of decoded characters.
     *
     * @throws IOException Thrown on the first mismatch, on premature
     *                     end of stream, or on trailing data.
     */
    private static void checkAllCharacters(CharSource in) throws IOException {

        System.err.println("testing 0x000000 -> 0x00007F");
        checkRange(in, 0x0000, 0x0080);
        System.err.println("testing 0x000080 -> 0x0007FF");
        checkRange(in, 0x0080, 0x0800);
        System.err.println("testing 0x000800 -> 0x00D7FF");
        checkRange(in, 0x0800, 0xD800);
        System.err.println("testing 0x00E000 -> 0x00FFFF");
        checkRange(in, 0xE000, 0x10000);

        System.err.println("testing 0x010000 -> 0x10FFFF");
        for (int i = 0x10000; i < 0x110000; i++) {
            // a supplementary code point is decoded as a surrogate pair:
            // the 20-bit offset from 0x10000 is split into two 10-bit halves
            int offset = i - 0x10000;
            int hs = 0xD800 | (offset >> 10);
            int ls = 0xDC00 | (offset & 0x03FF);

            // high surrogate
            int c = in.next();
            if (c != hs) {
                expectedChar("high surrogate", hs, c);
            }

            // low surrogate
            c = in.next();
            if (c != ls) {
                expectedChar("low surrogate", ls, c);
            }
        }

        System.err.println("checking EOF");
        int c = in.next();
        if (c != -1) {
            extraChar(c);
        }

    } // checkAllCharacters(CharSource)

    /**
     * Verifies that the next characters from the source are exactly
     * the values <code>start</code> (inclusive) through
     * <code>end</code> (exclusive).
     *
     * @param in    The source of decoded characters.
     * @param start The first expected character.
     * @param end   One past the last expected character.
     *
     * @throws IOException Thrown on the first mismatch or on premature
     *                     end of stream.
     */
    private static void checkRange(CharSource in, int start, int end)
        throws IOException {
        for (int i = start; i < end; i++) {
            int c = in.next();
            if (c != i) {
                expectedChar(null, i, c);
            }
        }
    } // checkRange(CharSource,int,int)

    //
    // Classes
    //

    /**
     * A source of decoded characters that hides how the characters
     * are pulled from the reader under test.
     */
    private static abstract class CharSource {

        /**
         * Returns the next character.
         *
         * @return The next character, or -1 at end of stream.
         *
         * @throws IOException Thrown by the underlying reader.
         */
        abstract int next() throws IOException;

    } // class CharSource

    /** Pulls characters from a reader one at a time. */
    private static final class SingleCharSource extends CharSource {

        /** The reader under test. */
        private final Reader fReader;

        /** Wraps the specified reader. */
        SingleCharSource(Reader reader) {
            fReader = reader;
        }

        /** Returns the result of a single character read. */
        int next() throws IOException {
            return fReader.read();
        }

    } // class SingleCharSource

    /** Pulls characters from a reader using block reads. */
    private static final class BlockCharSource extends CharSource {

        /** The reader under test. */
        private final Reader fReader;

        /** The block buffer. */
        private final char[] fBuffer;

        /** The number of valid characters in the buffer. */
        private int fCount;

        /** The position of the next unread character in the buffer. */
        private int fPosition;

        /** True once the reader has reported end of stream. */
        private boolean fEOF;

        /** Wraps the specified reader using a buffer of the given size. */
        BlockCharSource(Reader reader, int size) {
            fReader = reader;
            fBuffer = new char[size];
        }

        /**
         * Returns the next buffered character, refilling the buffer
         * with a block read when it has been consumed.
         */
        int next() throws IOException {
            if (fEOF) {
                return -1;
            }
            // loop so that a zero-length read never exposes stale data
            while (fPosition == fCount) {
                fPosition = 0;
                fCount = load(fReader, fBuffer);
                if (fCount == -1) {
                    // remember EOF so the stale buffer is never read
                    fCount = 0;
                    fEOF = true;
                    return -1;
                }
            }
            return fBuffer[fPosition++];
        }

    } // class BlockCharSource

    /**
     * This class produces a stream of UTF-8 byte sequences for all
     * valid Unicode characters. Code points are generated in
     * increasing order starting at zero; the surrogate block
     * (0xD800 - 0xDFFF) is skipped.
     *
     * @author Andy Clark, IBM
     */
    public static class UTF8Producer
        extends InputStream {

        //
        // Data
        //

        /** The current code point. */
        private int fCodePoint;

        /** The current byte of the current code point. */
        private int fByte;

        //
        // InputStream methods
        //

        /**
         * Reads the next byte of the UTF-8 encoding of the current code
         * point, advancing to the next code point once the last byte of
         * its sequence has been returned.
         *
         * @return The next byte, or -1 once every code point below
         *         0x110000 has been produced.
         *
         * @throws IOException Declared by <code>InputStream</code>.
         */
        public int read() throws IOException {

            // UTF-8:   [0xxx xxxx]
            // Unicode: [0000 0000] [0xxx xxxx]
            if (fCodePoint < 0x0080) {
                int b = fCodePoint;
                fCodePoint++;
                fByte = 0;
                return b;
            }

            // UTF-8:   [110y yyyy] [10xx xxxx]
            // Unicode: [0000 0yyy] [yyxx xxxx]
            if (fCodePoint < 0x0800) {
                switch (fByte) {
                    case 0: {
                        int b = 0x00C0 | ((fCodePoint >> 6) & 0x001F);
                        fByte++;
                        return b;
                    }
                    case 1: {
                        int b = 0x0080 | (fCodePoint & 0x003F);
                        fCodePoint++;
                        fByte = 0;
                        return b;
                    }
                    default: {
                        throw new RuntimeException("byte "+fByte+" of 2 byte UTF-8 sequence");
                    }
                }
            }

            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
            // Unicode: [zzzz yyyy] [yyxx xxxx]*
            if (fCodePoint < 0x10000) {
                switch (fByte) {
                    case 0: {
                        int b = 0x00E0 | ((fCodePoint >> 12) & 0x000F);
                        fByte++;
                        return b;
                    }
                    case 1: {
                        int b = 0x0080 | ((fCodePoint >> 6) & 0x003F);
                        fByte++;
                        return b;
                    }
                    case 2: {
                        int b = 0x0080 | (fCodePoint & 0x003F);
                        fCodePoint++;
                        // skip surrogate blocks
                        if (fCodePoint == 0xD800) {
                            fCodePoint = 0xE000;
                        }
                        fByte = 0;
                        return b;
                    }
                    default: {
                        throw new RuntimeException("byte "+fByte+" of 3 byte UTF-8 sequence");
                    }
                }
            }

            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
            //          [1101 11yy] [yyxx xxxx] (low surrogate)
            //          * uuuuu = wwww + 1
            //          [0000 0000] [000u uuuu] [zzzz yyyy] [yyxx xxxx]
            if (fCodePoint < 0x110000) {
                switch (fByte) {
                    case 0: {
                        int uuuuu = (fCodePoint >> 16) & 0x001F;
                        int b = 0x00F0 | (uuuuu >> 2);
                        fByte++;
                        return b;
                    }
                    case 1: {
                        int uuuuu = (fCodePoint >> 16) & 0x001F;
                        int zzzz = (fCodePoint >> 12) & 0x000F;
                        int b = 0x0080 | ((uuuuu << 4) & 0x0030) | zzzz;
                        fByte++;
                        return b;
                    }
                    case 2: {
                        int yyyyyy = (fCodePoint >> 6) & 0x003F;
                        int b = 0x0080 | yyyyyy;
                        fByte++;
                        return b;
                    }
                    case 3: {
                        int xxxxxx = fCodePoint & 0x003F;
                        int b = 0x0080 | xxxxxx;
                        fCodePoint++;
                        fByte = 0;
                        return b;
                    }
                    default: {
                        throw new RuntimeException("byte "+fByte+" of 4 byte UTF-8 sequence");
                    }
                }
            }

            // done
            return -1;

        } // read():int

    } // class UTF8Producer

} // class UTF8