package org.apache.tika.language;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.tika.exception.TikaException;
import org.hsqldb.Tokens;

/* loaded from: input_file:APP-INF/lib/tika-core-1.5.jar:org/apache/tika/language/LanguageProfilerBuilder.class */
public class LanguageProfilerBuilder {
    // NOTE(review): the four n-gram length constants below are not referenced by name
    // anywhere in this file; the constructors and create() use the literal 3 instead
    // (presumably inlined by the compiler before decompilation - confirm against upstream).
    static final int ABSOLUTE_MIN_NGRAM_LENGTH = 3;
    static final int ABSOLUTE_MAX_NGRAM_LENGTH = 3;
    static final int DEFAULT_MIN_NGRAM_LENGTH = 3;
    static final int DEFAULT_MAX_NGRAM_LENGTH = 3;
    /** Extension (without the dot) that main() appends to a profile name to form the profile file name. */
    static final String FILE_EXTENSION = "ngp";
    // NOTE(review): getSorted() and save() cap their lists at the literal 1000, which
    // matches this constant - presumably the same inlined value; confirm.
    static final int MAX_SIZE = 1000;
    // analyze() wraps every word in the literal '_' character, which matches this constant.
    static final char SEPARATOR = '_';
    /** An n-gram equal to this string is ignored by add(CharSequence). */
    private static final String SEP_CHARSEQ = "_";
    /** Profile name, as passed to the constructor and returned by getName(). */
    private String name;
    /** Shortest n-gram length that is recorded. */
    private int minLength;
    /** Longest n-gram length that is recorded; also sizes the ngramcounts array. */
    private int maxLength;
    /** N-gram text mapped to its entry (count and frequency). */
    private Map<CharSequence, NGramEntry> ngrams;
    /** Cached result of getSorted(); analyze() resets it to null. */
    private List<NGramEntry> sorted = null;
    /** Sum of entry counts per n-gram length, indexed by length; filled by normalize() and load(). */
    private int[] ngramcounts = null;
    /** Reusable buffer in which analyze() accumulates the current word. */
    private QuickStringBuffer word = new QuickStringBuffer();

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:APP-INF/lib/tika-core-1.5.jar:org/apache/tika/language/LanguageProfilerBuilder$NGramEntry.class */
    /**
     * One n-gram together with its occurrence count and its relative frequency.
     *
     * <p>Entries order themselves by descending frequency and, for equal frequencies,
     * by ascending text. Equality and hashing are delegated to the n-gram text.</p>
     */
    public static class NGramEntry implements Comparable<NGramEntry> {
        /** Profile associated with this entry; only ever assigned through {@link #setProfile}. */
        private LanguageProfilerBuilder profile;
        /** The n-gram text. */
        CharSequence seq;
        /** Number of occurrences recorded so far. */
        private int count;
        /** Relative frequency; written by the enclosing builder when it normalizes. */
        private float frequency;

        /**
         * Creates an entry with a count of zero.
         *
         * @param charSequence the n-gram text
         */
        public NGramEntry(CharSequence charSequence) {
            this.seq = charSequence;
        }

        /**
         * Creates an entry with a preset count.
         *
         * @param str the n-gram text; must not be {@code null}
         * @param i the initial occurrence count
         */
        public NGramEntry(String str, int i) {
            this.seq = new StringBuffer(str).subSequence(0, str.length());
            this.count = i;
        }

        /** @return the number of occurrences recorded for this n-gram */
        public int getCount() {
            return this.count;
        }

        /** @return the relative frequency of this n-gram */
        public float getFrequency() {
            return this.frequency;
        }

        /** @return the n-gram text */
        public CharSequence getSeq() {
            return this.seq;
        }

        /** @return the length of the n-gram text in characters */
        public int size() {
            return this.seq.length();
        }

        /**
         * Sorts higher frequencies first; ties are broken by the natural order of the text.
         *
         * @param nGramEntry the entry to compare against
         * @return a negative, zero or positive value following the {@link Comparable} contract
         */
        @Override
        public int compareTo(NGramEntry nGramEntry) {
            // Arguments are deliberately swapped so that larger frequencies sort first.
            int byFrequency = Float.compare(nGramEntry.getFrequency(), this.frequency);
            if (byFrequency != 0) {
                return byFrequency;
            }
            return toString().compareTo(nGramEntry.toString());
        }

        /** Increments the occurrence count by one. */
        public void inc() {
            this.count++;
        }

        /** @param languageProfilerBuilder the profile to associate with this entry */
        public void setProfile(LanguageProfilerBuilder languageProfilerBuilder) {
            this.profile = languageProfilerBuilder;
        }

        /** @return the associated profile, or {@code null} if none was set */
        public LanguageProfilerBuilder getProfile() {
            return this.profile;
        }

        /** @return the n-gram text as a string */
        @Override
        public String toString() {
            return this.seq.toString();
        }

        /** @return the hash code of the n-gram text */
        @Override
        public int hashCode() {
            return this.seq.hashCode();
        }

        /**
         * Two entries are equal when their n-gram texts are equal according to the
         * text object's own {@code equals}.
         *
         * @param obj the object to compare with; may be {@code null}
         * @return {@code true} only if {@code obj} is an entry whose text equals this one's
         */
        @Override
        public boolean equals(Object obj) {
            // Explicit type/null checks instead of catching NullPointerException or
            // ClassCastException; the outcome is the same for every input.
            if (!(obj instanceof NGramEntry)) {
                return false;
            }
            CharSequence otherSeq = ((NGramEntry) obj).seq;
            return otherSeq != null && otherSeq.equals(this.seq);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:APP-INF/lib/tika-core-1.5.jar:org/apache/tika/language/LanguageProfilerBuilder$QuickStringBuffer.class */
    /**
     * Minimal growable character buffer that can be reused through {@link #clear()}.
     *
     * <p>Only the first {@link #length()} characters of the backing array are part of
     * the sequence; the rest is spare capacity or left-over data from before a
     * {@code clear()}.</p>
     */
    public static class QuickStringBuffer implements CharSequence {
        /** Backing storage; may be longer than the logical content. */
        private char[] value;
        /** Number of characters in {@code value} that belong to the sequence. */
        private int count;

        /** Creates an empty buffer with room for 16 characters. */
        QuickStringBuffer() {
            this(16);
        }

        /**
         * Wraps the given array without copying it; the whole array is the content.
         *
         * @param cArr the array to use as backing storage
         */
        QuickStringBuffer(char[] cArr) {
            this.value = cArr;
            this.count = cArr.length;
        }

        /**
         * Creates an empty buffer with the given initial capacity.
         *
         * @param i the initial capacity in characters
         */
        QuickStringBuffer(int i) {
            this.value = new char[i];
        }

        /**
         * Creates a buffer holding the given string, with 16 characters of spare capacity.
         *
         * @param str the initial content; must not be {@code null}
         */
        QuickStringBuffer(String str) {
            this(str.length() + 16);
            append(str);
        }

        /** @return the number of characters currently in the buffer */
        @Override
        public int length() {
            return this.count;
        }

        /**
         * Grows the backing array to at least {@code i} characters, roughly doubling it.
         *
         * @param i the minimum capacity required
         */
        private void expandCapacity(int i) {
            int newCapacity = (this.value.length + 1) * 2;
            if (newCapacity < 0) {
                // Doubling overflowed int; fall back to the largest possible size.
                newCapacity = Integer.MAX_VALUE;
            } else if (i > newCapacity) {
                newCapacity = i;
            }
            char[] grown = new char[newCapacity];
            System.arraycopy(this.value, 0, grown, 0, this.count);
            this.value = grown;
        }

        /**
         * Empties the buffer; the backing array is kept and not wiped.
         *
         * @return this buffer
         */
        QuickStringBuffer clear() {
            this.count = 0;
            return this;
        }

        /**
         * @param i index of the requested character
         * @return the character at that index
         * @throws IndexOutOfBoundsException if {@code i} is negative or not less than {@link #length()}
         */
        @Override
        public char charAt(int i) {
            // Reject indexes inside the array but past the logical end, which would
            // otherwise expose stale characters.
            if (i < 0 || i >= this.count) {
                throw new StringIndexOutOfBoundsException(i);
            }
            return this.value[i];
        }

        /**
         * Appends a string; {@code null} is appended as the four characters "null".
         *
         * @param str the text to append
         * @return this buffer
         */
        QuickStringBuffer append(String str) {
            if (str == null) {
                str = "null";
            }
            int added = str.length();
            int newCount = this.count + added;
            if (newCount > this.value.length) {
                expandCapacity(newCount);
            }
            str.getChars(0, added, this.value, this.count);
            this.count = newCount;
            return this;
        }

        /**
         * Appends a single character.
         *
         * @param c the character to append
         * @return this buffer
         */
        QuickStringBuffer append(char c) {
            int newCount = this.count + 1;
            if (newCount > this.value.length) {
                expandCapacity(newCount);
            }
            this.value[this.count] = c;
            this.count = newCount;
            return this;
        }

        /**
         * Copies a range of the buffer into a new {@link String}.
         *
         * @param i start index, inclusive
         * @param i2 end index, exclusive
         * @return the characters in that range
         * @throws IndexOutOfBoundsException if the range is not within {@code [0, length()]}
         *         or {@code i} is greater than {@code i2}
         */
        @Override
        public CharSequence subSequence(int i, int i2) {
            if (i < 0 || i2 > this.count || i > i2) {
                throw new StringIndexOutOfBoundsException(
                        "start " + i + ", end " + i2 + ", length " + this.count);
            }
            return new String(this.value, i, i2 - i);
        }

        /** @return the buffer content, exactly {@link #length()} characters long */
        @Override
        public String toString() {
            // Only the first 'count' characters are content; the rest of the array is
            // unused capacity or stale data left behind by clear().
            return new String(this.value, 0, this.count);
        }
    }

    /**
     * Creates an empty profile builder with explicit n-gram length bounds.
     *
     * @param str the profile name
     * @param i the shortest n-gram length to record
     * @param i2 the longest n-gram length to record
     */
    public LanguageProfilerBuilder(String str, int i, int i2) {
        this.name = str;
        this.minLength = i;
        this.maxLength = i2;
        // Pre-sized for a few thousand distinct n-grams.
        this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
    }

    /**
     * Creates an empty profile builder that records n-grams of length 3 only.
     *
     * @param str the profile name
     */
    public LanguageProfilerBuilder(String str) {
        this(str, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
    }

    /**
     * Returns the name given to this profile at construction time.
     *
     * @return the profile name
     */
    public String getName() {
        return name;
    }

    /**
     * Records the n-grams of the given text for every configured n-gram length.
     *
     * <p>A length is only processed while it is strictly smaller than the text length,
     * so a text exactly as long as an n-gram length contributes nothing for that length.</p>
     *
     * @param stringBuffer the text whose n-grams are counted
     */
    public void add(StringBuffer stringBuffer) {
        int ngramLength = this.minLength;
        while (ngramLength <= this.maxLength && ngramLength < stringBuffer.length()) {
            add(stringBuffer, ngramLength);
            ngramLength++;
        }
    }

    /**
     * Records the n-grams that end at the current end of the buffer, one per
     * configured length that fits into the buffer.
     *
     * @param quickStringBuffer the word buffer whose trailing n-grams are counted
     */
    private void add(QuickStringBuffer quickStringBuffer) {
        int available = quickStringBuffer.length();
        if (available < this.minLength) {
            // Not enough characters yet for even the shortest n-gram.
            return;
        }
        int longest = Math.min(this.maxLength, available);
        for (int ngramLength = this.minLength; ngramLength <= longest; ngramLength++) {
            add(quickStringBuffer.subSequence(available - ngramLength, available));
        }
    }

    /**
     * Counts one occurrence of the given n-gram, creating its entry on first sight.
     * An n-gram consisting solely of the separator is ignored.
     *
     * @param charSequence the n-gram to count
     */
    private void add(CharSequence charSequence) {
        if (!charSequence.equals(SEP_CHARSEQ)) {
            NGramEntry entry = this.ngrams.get(charSequence);
            if (entry == null) {
                entry = new NGramEntry(charSequence);
                this.ngrams.put(charSequence, entry);
            }
            entry.inc();
        }
    }

    /**
     * Rebuilds this profile from the given text.
     *
     * <p>Previously collected n-grams and cached results are discarded. The text is
     * lower-cased character by character and split into words at every non-letter;
     * each word is wrapped in separator characters and its n-grams are counted as the
     * word grows. Frequencies are normalized at the end.</p>
     *
     * @param sb the text to analyze
     */
    public void analyze(StringBuilder sb) {
        if (this.ngrams != null) {
            this.ngrams.clear();
            this.sorted = null;
            this.ngramcounts = null;
        }

        // Every word starts with a leading separator.
        this.word.clear().append(SEPARATOR);

        for (int pos = 0; pos < sb.length(); pos++) {
            char current = Character.toLowerCase(sb.charAt(pos));
            if (!Character.isLetter(current)) {
                // A non-letter ends the current word, if one has been started.
                if (this.word.length() > 1) {
                    add(this.word.append(SEPARATOR));
                    this.word.clear().append(SEPARATOR);
                }
                continue;
            }
            add(this.word.append(current));
        }

        // Close a word that runs up to the end of the text.
        if (this.word.length() > 1) {
            add(this.word.append(SEPARATOR));
        }
        normalize();
    }

    /**
     * Counts every n-gram of the given length contained in the text, sliding one
     * character at a time.
     *
     * @param stringBuffer the text to take n-grams from
     * @param i the n-gram length
     */
    private void add(StringBuffer stringBuffer, int i) {
        int lastStart = stringBuffer.length() - i;
        int start = 0;
        while (start <= lastStart) {
            add(stringBuffer.subSequence(start, start + i));
            start++;
        }
    }

    /**
     * Computes the relative frequency of every n-gram.
     *
     * <p>If the per-length totals have not been computed yet, they are built first by
     * summing the counts of all entries of each length. Each entry's frequency is then
     * its count divided by the total for its length.</p>
     */
    protected void normalize() {
        if (this.ngramcounts == null) {
            this.ngramcounts = new int[this.maxLength + 1];
            for (NGramEntry entry : this.ngrams.values()) {
                this.ngramcounts[entry.size()] += entry.count;
            }
        }
        for (NGramEntry entry : this.ngrams.values()) {
            int total = this.ngramcounts[entry.size()];
            // Divide as floats: an int/int division would truncate every frequency to
            // 0 or 1. A zero total yields a frequency of 0 rather than NaN.
            entry.frequency = total == 0 ? 0.0f : (float) entry.count / (float) total;
        }
    }

    /**
     * Returns the entries in their natural order, limited to the first 1000.
     *
     * <p>The list is computed on first use and cached; later calls return the same
     * list instance until the cache is reset.</p>
     *
     * @return the sorted, size-limited list of entries
     */
    public List<NGramEntry> getSorted() {
        if (this.sorted != null) {
            return this.sorted;
        }
        this.sorted = new ArrayList<NGramEntry>(this.ngrams.values());
        Collections.sort(this.sorted);
        boolean tooLong = this.sorted.size() > MAX_SIZE;
        if (tooLong) {
            this.sorted = this.sorted.subList(0, MAX_SIZE);
        }
        return this.sorted;
    }

    /**
     * Renders the profile as text: a header line with the profile name followed by
     * one {@code [ngram/count/frequency]} line per entry of {@link #getSorted()}.
     *
     * @return the textual representation of this profile
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder().append("NGramProfile: ").append(this.name).append("\n");
        for (NGramEntry entry : getSorted()) {
            // Plain "[" literal; no constant from an unrelated library is needed for a bracket.
            text.append("[").append(entry.seq)
                    .append("/").append(entry.count)
                    .append("/").append(entry.frequency)
                    .append("]\n");
        }
        return text.toString();
    }

    /**
     * Computes a distance between this profile and another one; identical profiles
     * yield 0 and larger values mean less similar.
     *
     * <p>The sorted entries of each profile are walked in turn. An n-gram present in
     * both profiles contributes half the absolute difference of its two frequencies
     * (in each pass); an n-gram missing from the other profile contributes its full
     * frequency.</p>
     *
     * @param languageProfilerBuilder the profile to compare with
     * @return the accumulated distance
     * @throws TikaException if the computation fails for any reason, for example
     *         because {@code languageProfilerBuilder} is {@code null}
     */
    public float getSimilarity(LanguageProfilerBuilder languageProfilerBuilder) throws TikaException {
        float distance = 0.0f;
        try {
            for (NGramEntry theirs : languageProfilerBuilder.getSorted()) {
                // Single lookup; the map never holds null values, so null means "absent".
                NGramEntry mine = this.ngrams.get(theirs.seq);
                if (mine != null) {
                    distance += Math.abs(theirs.frequency - mine.frequency) / 2.0f;
                } else {
                    distance += theirs.frequency;
                }
            }
            for (NGramEntry mine : getSorted()) {
                NGramEntry theirs = languageProfilerBuilder.ngrams.get(mine.seq);
                if (theirs != null) {
                    distance += Math.abs(mine.frequency - theirs.frequency) / 2.0f;
                } else {
                    distance += mine.frequency;
                }
            }
            return distance;
        } catch (Exception e) {
            // Keep the original failure as the cause so it is not lost.
            throw new TikaException("Could not calculate a score how well NGramProfiles match each other", e);
        }
    }

    /**
     * Replaces the content of this profile with entries read from a saved profile.
     *
     * <p>The stream is read as UTF-8 text, one entry per line in the form
     * {@code <ngram> <count>}. Empty lines and lines starting with {@code #} are
     * skipped, as are n-grams whose length lies outside the configured bounds.
     * Frequencies are normalized once the whole stream has been read. The stream is
     * not closed.</p>
     *
     * @param inputStream the stream to read the profile from
     * @throws IOException if reading fails or a line contains no space between
     *         n-gram and count
     * @throws NumberFormatException if a count is not a valid integer
     */
    public void load(InputStream inputStream) throws IOException {
        this.ngrams.clear();
        // The cached sorted list would otherwise still describe the previous content.
        this.sorted = null;
        this.ngramcounts = new int[this.maxLength + 1];
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            // Skip blank lines (charAt(0) would fail on them) and comment lines.
            if (line.length() == 0 || line.charAt(0) == '#') {
                continue;
            }
            int separator = line.indexOf(' ');
            if (separator < 0) {
                throw new IOException("Malformed n-gram profile line, expected '<ngram> <count>': " + line);
            }
            String ngram = line.substring(0, separator).trim();
            int length = ngram.length();
            if (length >= this.minLength && length <= this.maxLength) {
                // trim() tolerates stray whitespace around the count.
                int ngramCount = Integer.parseInt(line.substring(separator + 1).trim());
                NGramEntry entry = new NGramEntry(ngram, ngramCount);
                this.ngrams.put(entry.getSeq(), entry);
                this.ngramcounts[length] += ngramCount;
            }
        }
        normalize();
    }

    /**
     * Builds a profile of n-grams of length 3 from the text in a stream.
     *
     * <p>The whole stream is decoded with the given encoding and then analyzed. The
     * stream is not closed.</p>
     *
     * @param str the name of the new profile
     * @param inputStream the stream providing the text
     * @param str2 the name of the character encoding of the stream
     * @return the analyzed profile
     * @throws TikaException if the stream cannot be read or the encoding is not supported
     */
    public static LanguageProfilerBuilder create(String str, InputStream inputStream, String str2) throws TikaException {
        LanguageProfilerBuilder builder = new LanguageProfilerBuilder(str, 3, 3);
        StringBuilder text = new StringBuilder();
        try {
            // Decode through a single reader: decoding each byte chunk on its own would
            // corrupt any multi-byte character that straddles a chunk boundary.
            InputStreamReader reader = new InputStreamReader(new BufferedInputStream(inputStream), str2);
            char[] chunk = new char[4096];
            int read;
            while ((read = reader.read(chunk)) != -1) {
                text.append(chunk, 0, read);
            }
        } catch (IOException e) {
            // Keep the original failure as the cause.
            throw new TikaException("Could not create profile, " + e.getMessage(), e);
        }
        builder.analyze(text);
        return builder;
    }

    /* JADX WARN: Multi-variable type inference failed */
    /* JADX WARN: Type inference failed for: r0v33, types: [java.util.List] */
    /**
     * Writes this profile to a stream as UTF-8 text.
     *
     * <p>The output starts with a {@code #} comment line carrying the generation date.
     * Then, for each n-gram length from the minimum to the maximum, the entries of
     * that length are sorted and at most the first 1000 are written, one
     * {@code <ngram> <count>} line each. The stream is flushed but not closed.</p>
     *
     * @param outputStream the stream to write to
     * @throws IOException if writing fails
     */
    public void save(OutputStream outputStream) throws IOException {
        // Same charset as the entry lines below, so the file has a single encoding.
        outputStream.write(("# NgramProfile generated at " + new Date() + " for Apache Tika Language Identification\n").getBytes("UTF-8"));

        List<NGramEntry> selected = new ArrayList<NGramEntry>();
        for (int length = this.minLength; length <= this.maxLength; length++) {
            // A fresh list per length, so that truncating with subList() never leaves
            // a view over an earlier list in use.
            List<NGramEntry> sameLength = new ArrayList<NGramEntry>();
            for (NGramEntry entry : this.ngrams.values()) {
                if (entry.getSeq().length() == length) {
                    sameLength.add(entry);
                }
            }
            Collections.sort(sameLength);
            if (sameLength.size() > MAX_SIZE) {
                sameLength = sameLength.subList(0, MAX_SIZE);
            }
            selected.addAll(sameLength);
        }

        for (NGramEntry entry : selected) {
            outputStream.write((entry.toString() + " " + entry.getCount() + "\n").getBytes("UTF-8"));
        }
        outputStream.flush();
    }

    /**
     * Command-line entry point.
     *
     * <ul>
     *   <li>{@code -create profilename filename encoding}: analyzes the file and saves
     *       the profile as {@code profilename.ngp}.</li>
     *   <li>{@code -similarity file1 file2 [encoding]}: analyzes both files and prints
     *       their similarity; the encoding defaults to UTF-8 when omitted.</li>
     *   <li>{@code -score profile-name filename encoding}: analyzes the file and prints
     *       its similarity to the saved profile {@code profile-name.ngp}.</li>
     * </ul>
     *
     * <p>Without arguments, or when a command lacks required arguments, the usage text
     * is printed and the JVM exits with status -1. If several commands are given, the
     * last one wins. Failures while executing a command are printed as a stack trace.</p>
     *
     * @param strArr the command-line arguments
     */
    public static void main(String[] strArr) {
        final String usage = "Usage: NGramProfile [-create profilename filename encoding] [-similarity file1 file2] [-score profile-name filename encoding]";
        final int cmdCreate = 1;
        final int cmdSimilarity = 2;
        final int cmdScore = 3;

        int command = 0;
        String profileName = "";
        String fileName = "";
        String fileName2 = "";
        String encoding = "";

        if (strArr.length == 0) {
            System.err.println(usage);
            System.exit(-1);
        }

        int i = 0;
        while (i < strArr.length) {
            String option = strArr[i];
            if (option.equals("-create") || option.equals("-score")) {
                if (i + 3 >= strArr.length) {
                    System.err.println(usage);
                    System.exit(-1);
                    return;
                }
                command = option.equals("-create") ? cmdCreate : cmdScore;
                profileName = strArr[i + 1];
                fileName = strArr[i + 2];
                encoding = strArr[i + 3];
                i += 3;
            } else if (option.equals("-similarity")) {
                if (i + 2 >= strArr.length) {
                    System.err.println(usage);
                    System.exit(-1);
                    return;
                }
                command = cmdSimilarity;
                fileName = strArr[i + 1];
                fileName2 = strArr[i + 2];
                i += 2;
                // The usage text lists only two files; accept an optional encoding
                // instead of running past the end of the argument array.
                if (i + 1 < strArr.length && !strArr[i + 1].startsWith("-")) {
                    i++;
                    encoding = strArr[i];
                } else {
                    encoding = "UTF-8";
                }
            }
            i++;
        }

        try {
            if (command == cmdCreate) {
                LanguageProfilerBuilder created = profileFromFile(profileName, fileName, encoding);
                FileOutputStream out = new FileOutputStream(new File(profileName + "." + FILE_EXTENSION));
                try {
                    created.save(out);
                } finally {
                    out.close();
                }
                System.out.println("new profile " + profileName + "." + FILE_EXTENSION + " was created.");
            } else if (command == cmdSimilarity) {
                LanguageProfilerBuilder first = profileFromFile(fileName, fileName, encoding);
                first.normalize();
                LanguageProfilerBuilder second = profileFromFile(fileName2, fileName2, encoding);
                second.normalize();
                System.out.println("Similarity is " + first.getSimilarity(second));
            } else if (command == cmdScore) {
                LanguageProfilerBuilder sample = profileFromFile(fileName, fileName, encoding);
                LanguageProfilerBuilder reference = new LanguageProfilerBuilder(profileName, 3, 3);
                FileInputStream in = new FileInputStream(new File(profileName + "." + FILE_EXTENSION));
                try {
                    reference.load(in);
                } finally {
                    in.close();
                }
                System.out.println("Score is " + reference.getSimilarity(sample));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Analyzes the named file into a new profile and always closes the file afterwards. */
    private static LanguageProfilerBuilder profileFromFile(String profileName, String fileName, String encoding)
            throws IOException, TikaException {
        FileInputStream in = new FileInputStream(new File(fileName));
        try {
            return create(profileName, in, encoding);
        } finally {
            in.close();
        }
    }
}
