/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.xerces.impl.io.UTF8Reader;

/**
 * This program tests the customized UTF-8 reader for the parser,
 * comparing it with the Java UTF-8 reader. Interestingly, when
 * reading character by character (as opposed to block character
 * reads), the Java reader silently skips surrogate characters
 * on the input! I've seen this behavior in 1.1.8, 1.2, and 1.3
 * under the Windows platform.
 *
 * @author Andy Clark, IBM
 *
 * @version $Id$
 */
public class UTF8 {

    //
    // MAIN
    //

    /** Main program entry. */
    public static void main(String[] argv) throws Exception {

        final int BLOCK_READ_SIZE = 2048;

        //
        // Test Java reference implementation of UTF-8 decoder
        //

        System.err.println("#");
        System.err.println("# Testing Java UTF-8 decoder");
        System.err.println("#");

        // test character by character
        try {
            InputStream stream = new UTF8Producer();
            Reader reader = new InputStreamReader(stream, "UTF8");
            long time = testCharByChar(reader);
            System.err.println("PASS ("+time+" ms)");
            reader.close();
        } 
        catch (IOException e) {
            System.err.println("FAIL: "+e.getMessage());
        }
        
        // test character array
        try {
            InputStream stream = new UTF8Producer();
            Reader reader = new InputStreamReader(stream, "UTF8");
            long time = testCharArray(reader, BLOCK_READ_SIZE);
            System.err.println("PASS ("+time+" ms)");
            reader.close();
        } 
        catch (IOException e) {
            System.err.println("FAIL: "+e.getMessage());
        }
        
        //
        // Test custom implementation of UTF-8 decoder
        //

        System.err.println("#");
        System.err.println("# Testing custom UTF-8 decoder");
        System.err.println("#");

        // test character by character
        try {
            InputStream stream = new UTF8Producer();
            Reader reader = new UTF8Reader(stream);
            long time = testCharByChar(reader);
            System.err.println("PASS ("+time+" ms)");
            reader.close();
        } 
        catch (IOException e) {
            System.err.println("FAIL: "+e.getMessage());
        }
        
        // test character array
        try {
            InputStream stream = new UTF8Producer();
            Reader reader = new UTF8Reader(stream);
            long time = testCharArray(reader, BLOCK_READ_SIZE);
            System.err.println("PASS ("+time+" ms)");
            reader.close();
        } 
        catch (IOException e) {
            System.err.println("FAIL: "+e.getMessage());
        }
        
    } // main(String[])

    //
    // Public static methods
    //

    /** This function tests the specified reader character by character. */
    public static long testCharByChar(Reader reader) throws Exception {

        long before = System.currentTimeMillis();
        System.err.println("# Testing character by character");

        System.err.println("testing 0x000000 -> 0x00007F");
        for (int i = 0; i < 0x0080; i++) {
            int c = reader.read();
            if (c != i) {
                expectedChar(null, i, c);
            }
        }
        System.err.println("testing 0x000080 -> 0x0007FF");
        for (int i = 0x0080; i < 0x0800; i++) {
            int c = reader.read();
            if (c != i) {
                expectedChar(null, i, c);
            }
        }
        System.err.println("testing 0x000800 -> 0x00D7FF");
        for (int i = 0x0800; i < 0xD800; i++) {
            int c = reader.read();
            if (c != i) {
                expectedChar(null, i, c);
            }
        }
        System.err.println("testing 0x00E000 -> 0x00FFFF");
        for (int i = 0xE000; i < 0x010000; i++) {
            int c = reader.read();
            if (c != i) {
                expectedChar(null, i, c);
            }
        }
        System.err.println("testing 0x010000 -> 0x110000");
        for (int i = 0x10000; i < 0x110000; i++) {
            // vars
            int uuuuu = (i >> 16) & 0x001F;
            int wwww = uuuuu - 1;
            int zzzz = (i >> 12) & 0x000F;
            int yyyyyy = (i >> 6) & 0x003F;
            int xxxxxx = i & 0x003F;
            int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4);
            int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
            // high surrogate
            int c = reader.read();
            if (c != hs) {
                expectedChar("high surrogate", hs, c);
            }
            // low surrogate
            c = reader.read();
            if (c != ls) {
                expectedChar("low surrogate", ls, c);
            }
        }
        System.err.println("checking EOF");
        int c = reader.read();
        if (c != -1) {
            extraChar(c);
        }
        long after = System.currentTimeMillis();

        return after - before;

    } // testCharByChar(Reader):long

    /**
     * This function tests the given reader by performing block character
     * reads of the specified size.
     */
    public static long testCharArray(Reader reader, int size) throws Exception {

        long before = System.currentTimeMillis();
        System.err.println("# Testing character array of size "+size);

        char[] ch = new char[size];
        int count = 0;
        int position = 0;

        System.err.println("testing 0x000000 -> 0x00007F");
        for (int i = 0; i < 0x0080; i++) {
            if (position == count) {
                count = load(reader, ch);
                position = 0;
            }
            int c = ch[position++];
            if (c != i) {
                expectedChar(null, i, c);
            }
        }
        System.err.println("testing 0x000080 -> 0x0007FF");
        for (int i = 0x0080; i < 0x0800; i++) {
            if (position == count) {
                count = load(reader, ch);
                position = 0;
            }
            int c = ch[position++];
            if (c != i) {
                expectedChar(null, i, c);
            }
        }
        System.err.println("testing 0x000800 -> 0x00D7FF");
        for (int i = 0x0800; i < 0xD800; i++) {
            if (position == count) {
                count = load(reader, ch);
                position = 0;
            }
            int c = ch[position++];
            if (c != i) {
                expectedChar(null, i, c);
            }
        }
        System.err.println("testing 0x00E000 -> 0x00FFFF");
        for (int i = 0xE000; i < 0x010000; i++) {
            if (position == count) {
                count = load(reader, ch);
                position = 0;
            }
            int c = ch[position++];
            if (c != i) {
                expectedChar(null, i, c);
            }
        }
        System.err.println("testing 0x010000 -> 0x10FFFF");
        for (int i = 0x10000; i < 0x110000; i++) {
            // vars
            int uuuuu = (i >> 16) & 0x001F;
            int wwww = uuuuu - 1;
            int zzzz = (i >> 12) & 0x000F;
            int yyyyyy = (i >> 6) & 0x003F;
            int xxxxxx = i & 0x003F;
            int hs = 0xD800 | (wwww << 6) | (zzzz << 2) | (yyyyyy >> 4);
            int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
            // high surrogate
            if (position == count) {
                count = load(reader, ch);
                position = 0;
            }
            int c = ch[position++];
            if (c != hs) {
                expectedChar("high surrogate", hs, c);
            }
            // low surrogate
            if (position == count) {
                count = load(reader, ch);
                position = 0;
            }
            c = ch[position++];
            if (c != ls) {
                expectedChar("low surrogate", ls, c);
            }
        }
        System.err.println("checking EOF");
        if (position == count) {
            count = load(reader, ch);
            position = 0;
        }
        if (count != -1) {
            extraChar(ch[position]);
        }
        long after = System.currentTimeMillis();

        return after - before;

    } // testCharArray(Reader):long

    //
    // Package private static methods
    //

    /** Loads another block of characters from the reader. */
    static int load(Reader reader, char[] ch) throws IOException {
        int count = reader.read(ch, 0, ch.length);
        return count;
    } // load(Reader,char[]):int

    /** Creates an I/O exception for expected character. */
    static void expectedChar(String prefix, int ec, int fc) throws IOException {
        StringBuffer str = new StringBuffer();
        str.append("expected ");
        if (prefix != null) {
            str.append(prefix);
            str.append(' ');
        }
        str.append("0x");
        str.append(Integer.toHexString(ec));
        str.append(" but found 0x");
        if (fc != -1) {
            str.append(Integer.toHexString(fc));
        }
        else {
            str.append("EOF");
        }
        String message = str.toString();
        throw new IOException(message);
    } // expectedChar(String,int,int)

    /** Creates an I/O exception for extra character. */
    static void extraChar(int c) throws IOException {
        StringBuffer str = new StringBuffer();
        str.append("found extra character 0x");
        str.append(Integer.toHexString(c));
        String message = str.toString();
        throw new IOException(message);
    } // extraChar(int)

    //
    // Classes
    //

    /**
     * This classes produces a stream of UTF-8 byte sequences for all 
     * valid Unicode characters.
     *
     * @author Andy Clark, IBM
     */
    public static class UTF8Producer
        extends InputStream {

        //
        // Data
        //

        /** The current code point. */
        private int fCodePoint;

        /** The current byte of the current code point. */
        private int fByte;

        //
        // InputStream methods
        //

        /** Reads the next character. */
        public int read() throws IOException {

            // UTF-8:   [0xxx xxxx]
            // Unicode: [0000 0000] [0xxx xxxx]
            if (fCodePoint < 0x0080) {
                int b = fCodePoint;
                fCodePoint++;
                fByte = 0;
                return b;
            }

            // UTF-8:   [110y yyyy] [10xx xxxx]
            // Unicode: [0000 0yyy] [yyxx xxxx]
            if (fCodePoint < 0x0800) {
                switch (fByte) {
                    case 0: {
                        int b = 0x00C0 | ((fCodePoint >> 6) & 0x001F);
                        fByte++;
                        return b;
                    }
                    case 1: {
                        int b = 0x0080 | (fCodePoint & 0x003F);
                        fCodePoint++;
                        fByte = 0;
                        return b;
                    }
                    default: {
                        throw new RuntimeException("byte "+fByte+" of 2 byte UTF-8 sequence");
                    }
                }
            }

            // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
            // Unicode: [zzzz yyyy] [yyxx xxxx]*
            if (fCodePoint < 0x10000) {
                switch (fByte) {
                    case 0: {
                        int b = 0x00E0 | ((fCodePoint >> 12) & 0x000F);
                        fByte++;
                        return b;
                    }
                    case 1: {
                        int b = 0x0080 | ((fCodePoint >> 6) & 0x003F);
                        fByte++;
                        return b;
                    }
                    case 2: {
                        int b = 0x0080 | (fCodePoint & 0x003F);
                        fCodePoint++;
                        // skip surrogate blocks
                        if (fCodePoint == 0xD800) {
                            fCodePoint = 0xE000;
                        }
                        fByte = 0;
                        return b;
                    }
                    default: {
                        throw new RuntimeException("byte "+fByte+" of 3 byte UTF-8 sequence");
                    }
                }
            }

            // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
            // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
            //          [1101 11yy] [yyxx xxxx] (low surrogate)
            //          * uuuuu = wwww + 1
            //          [0000 0000] [000u uuuu] [zzzz yyyy] [yyxx xxxx]
            if (fCodePoint < 0x110000) {
                switch (fByte) {
                    case 0: {
                        int uuuuu = (fCodePoint >> 16) & 0x001F;
                        int b = 0x00F0 | (uuuuu >> 2);
                        fByte++;
                        return b;
                    }
                    case 1: {
                        int uuuuu = (fCodePoint >> 16) & 0x001F;
                        int zzzz = (fCodePoint >> 12) & 0x000F;
                        int b = 0x0080 | ((uuuuu << 4) & 0x0030) | zzzz;
                        fByte++;
                        return b;
                    }
                    case 2: {
                        int yyyyyy = (fCodePoint >> 6) & 0x003F;
                        int b = 0x0080 | yyyyyy;
                        fByte++;
                        return b;
                    }
                    case 3: {
                        int xxxxxx = fCodePoint & 0x003F;
                        int b = 0x0080 | xxxxxx;
                        fCodePoint++;
                        fByte = 0;
                        return b;
                    }
                    default: {
                        throw new RuntimeException("byte "+fByte+" of 4 byte UTF-8 sequence");
                    }
                }
            }
            
            // done
            return -1;

        } // read():int

    } // class UTF8Producer

} // class UTF8