package org.apache.stanbol.enhancer.engines.htmlextractor.impl;

import com.ibm.icu.text.CharsetDetector;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.abdera.util.Constants;
import org.jbpm.formModeler.core.processing.FormProcessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/org.apache.stanbol.enhancer.engines.htmlextractor-0.10.0.jar:org/apache/stanbol/enhancer/engines/htmlextractor/impl/CharsetRecognizer.class */
/**
 * Determines the character encoding of an input stream.
 *
 * <p>When a format hint is supplied, the encoding declared inside the document
 * (XML declaration or HTML {@code <meta>} tag) is looked up first; otherwise, or
 * when nothing is declared, the ICU {@link CharsetDetector} is consulted.
 */
public class CharsetRecognizer {
    private static final Logger LOG = LoggerFactory.getLogger(CharsetRecognizer.class);

    /** Read-ahead limit handed to {@link InputStream#mark(int)} while peeking at the document head. */
    private static final int MARK_LIMIT = 4096;

    /** Number of leading bytes inspected for an XML declaration. */
    private static final int XML_PREFIX_LENGTH = 80;

    /** Number of leading bytes inspected for an HTML meta charset declaration. */
    private static final int HTML_PREFIX_LENGTH = 2048;

    /** Encoding assumed for XML input that carries no encoding declaration. */
    private static final String XML_DEFAULT_ENCODING = "UTF-8";

    /**
     * Matches the encoding pseudo-attribute of an XML declaration. Both quote
     * styles and optional whitespace around {@code =} are legal XML.
     */
    private static final Pattern XML_ENCODING =
            Pattern.compile("encoding\\s*=\\s*[\"'](\\w[-\\w]+)[\"']");

    /**
     * Matches a charset inside a single {@code <meta ...>} tag, covering both
     * {@code content="text/html; charset=x"} and the HTML5 {@code charset="x"}
     * form. {@code [^>]} keeps the match inside one tag and also spans line
     * breaks; HTML tag and attribute names are case-insensitive.
     */
    private static final Pattern HTML_META_CHARSET =
            Pattern.compile("<meta\\s[^>]*?charset\\s*=\\s*[\"']?(\\w[-\\w]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Returns the requested capture group of the first match of {@code pattern}
     * in {@code text}, or {@code null} when the pattern does not occur.
     */
    private static String checkPattern(String text, Pattern pattern, int group) {
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group(group) : null;
    }

    /**
     * Reads up to {@code length} bytes from the head of the stream, decodes them
     * as US-ASCII and rewinds the stream to where it was.
     *
     * @return the decoded prefix; empty when the stream is already at its end
     */
    private static String peek(InputStream inputStream, int length) throws IOException {
        byte[] buffer = new byte[length];
        inputStream.mark(MARK_LIMIT);
        int count = inputStream.read(buffer);
        inputStream.reset();
        // read() reports end-of-stream as -1, which is not a valid length for new String(...).
        return new String(buffer, 0, Math.max(count, 0), "US-ASCII");
    }

    /**
     * Looks for an encoding declared inside the document itself.
     *
     * @param format      format hint, compared case-insensitively against
     *                    {@code "xml"} and {@code Constants.HTML}
     * @param inputStream stream supporting mark/reset; left at its original position
     * @return the declared encoding in upper case, {@code "UTF-8"} for XML without
     *         a declaration, or {@code null} when nothing could be determined
     */
    private static String checkFormat(String format, InputStream inputStream) throws IOException {
        String declared = null;
        String fallback = null;
        if (format.equalsIgnoreCase("xml")) {
            fallback = XML_DEFAULT_ENCODING;
            declared = checkPattern(peek(inputStream, XML_PREFIX_LENGTH), XML_ENCODING, 1);
        } else if (format.equalsIgnoreCase(Constants.HTML)) {
            declared = checkPattern(peek(inputStream, HTML_PREFIX_LENGTH), HTML_META_CHARSET, 1);
        }
        if (declared == null) {
            return fallback;
        }
        // Locale.ROOT: charset names are ASCII; the default locale (e.g. Turkish) must not alter them.
        String encoding = declared.toUpperCase(Locale.ROOT);
        LOG.debug("{} encoding: {}", format.toUpperCase(Locale.ROOT), encoding);
        return encoding;
    }

    /**
     * Detects the encoding of a stream without any format or encoding hint.
     *
     * @see #detect(InputStream, String, String)
     */
    public static String detect(InputStream inputStream) throws IOException {
        return detect(inputStream, null, null);
    }

    /**
     * Detects the encoding of a stream.
     *
     * @param inputStream      stream to inspect; must support mark/reset
     * @param format           optional format hint ({@code "xml"} or the value of
     *                         {@code Constants.HTML}); {@code null} skips the
     *                         in-document declaration lookup
     * @param declaredEncoding optional encoding hint passed to the ICU detector;
     *                         may be {@code null}
     * @return the name of the detected encoding
     * @throws IOException if the stream does not support mark/reset or cannot be read
     */
    public static String detect(InputStream inputStream, String format, String declaredEncoding) throws IOException {
        if (!inputStream.markSupported()) {
            throw new IOException("Mark not supported by input stream");
        }
        if (format != null) {
            String declared = checkFormat(format, inputStream);
            if (declared != null) {
                return declared;
            }
        }
        CharsetDetector charsetDetector = new CharsetDetector();
        if (declaredEncoding != null) {
            charsetDetector.setDeclaredEncoding(declaredEncoding);
        }
        charsetDetector.setText(inputStream);
        // NOTE(review): ICU documents that detect() may return null when no charset
        // matches with sufficient confidence; that case is not handled here - confirm
        // whether callers need a fallback.
        String name = charsetDetector.detect().getName();
        LOG.debug("Encoding: {}", name);
        return name;
    }

    /**
     * Command-line entry point. Leading arguments starting with
     * {@code FormProcessor.NAMESPACE_SEPARATOR} (presumably "-"; verify) are
     * options: one beginning with {@code f} takes the format hint, one beginning
     * with {@code e} takes the declared encoding, each from the following
     * argument. All remaining arguments are file names whose detected encoding
     * is printed to standard output.
     */
    public static void main(String[] strArr) {
        String format = null;
        String declaredEncoding = null;
        int i = 0;
        while (i < strArr.length && strArr[i].startsWith(FormProcessor.NAMESPACE_SEPARATOR)) {
            String option = strArr[i].substring(1);
            boolean isFormat = option.startsWith("f");
            if (isFormat || option.startsWith("e")) {
                // The option's value is the next argument; make sure there is one.
                if (i + 1 >= strArr.length) {
                    System.err.println("missing value for option: " + option);
                    System.exit(1);
                    return;
                }
                i++;
                if (isFormat) {
                    format = strArr[i];
                } else {
                    declaredEncoding = strArr[i];
                }
            } else {
                System.err.println("illegal option: " + option);
                System.exit(1);
            }
            i++;
        }
        for (int j = i; j < strArr.length; j++) {
            String fileName = strArr[j];
            InputStream in = null;
            try {
                in = new BufferedInputStream(new FileInputStream(fileName));
                System.out.println("Encoding: " + detect(in, format, declaredEncoding) + ": " + fileName);
            } catch (IOException e) {
                // Pass the exception itself so the stack trace and cause are kept.
                LOG.error("Unable to detect encoding of " + fileName, e);
            } finally {
                // Close even when detection failed so the file handle is not leaked.
                if (in != null) {
                    try {
                        in.close();
                    } catch (IOException e) {
                        LOG.error("Unable to close " + fileName, e);
                    }
                }
            }
        }
    }
}
