package org.jasen.core.token;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import org.jasen.core.linguistics.LinguisticAnalyzer;
import org.jasen.core.parsers.URLParser;
import org.jasen.interfaces.TokenErrorRecorder;
import org.jasen.util.DNSUtils;
import org.jasen.util.MimeUtils;

/* loaded from: input_file:jasen.jar:org/jasen/core/token/SpamTokenizer.class */
/**
 * Token extraction support for the jASEN core.
 *
 * <p>NOTE(review): this source was produced by a decompiler and the main
 * {@link #tokenize(Reader, boolean, TokenErrorRecorder)} routine could not be
 * recovered, so it currently throws {@link UnsupportedOperationException}.
 * What remains is the token validation logic, the stop-word / stop-character
 * tables and the character classification helpers. Most private helpers have
 * no visible caller in this file; they are presumably used by the missing
 * routine.
 */
public class SpamTokenizer {
    // Result codes returned by isValidToken(...).
    private static final int VALID_TOKEN = 0;
    private static final int INVALID_TOKEN_TOO_LONG = 1;
    private static final int INVALID_TOKEN_TOO_SHORT = 2;
    private static final int INVALID_TOKEN_STOP_WORD = 3;
    private static final int INVALID_TOKEN_LINGUISTIC_ERROR = 4;
    private static final int INVALID_TOKEN_ONLY_NUMERIC = 5;
    private static final int INVALID_TOKEN_MAX_TOKENS_EXCEEDED = 6;

    /** Value returned by {@code peek} when the reader has no more characters (U+FFFF). */
    private static final char END_OF_STREAM = '\uffff';

    /** Token limit exposed through {@link #getMaxTokens()} / {@link #setMaxTokens(int)}. */
    protected int maxTokens = 50;

    // Not read anywhere in the visible code; presumably consumed by the missing tokenize body.
    protected int linguisticLimit = 3;

    /** Tokens whose trimmed length is below this value are rejected as too short. */
    public static int MIN_TOKEN_LENGTH = 3;

    // The next two settings are not read in the visible code; presumably used by the missing tokenize body.
    public static int MAX_TOKEN_LENGTH = 12;
    public static double TOKEN_RECOGNITION_THRESHOLD = 0.1d;

    /** Words that are never accepted as tokens; compared against the lower-cased token. */
    public static String[] STOP_WORDS = {"about", "again", "after", "all", "and", "another", "are", "arial", "because", "been", "but", "can", "did", "div", "does", "down", "each", "file", "find", "font", "for", "from", "ftp", "had", "has", "have", "helvetica", "her", "him", "his", "how", "href", "html", "http", "into", "its", "just", "know", "like", "made", "mailto", "make", "many", "may", "more", "most", "not", "one", "only", "other", "our", "out", "over", "said", "sans", "see", "serif", "she", "some", "such", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "two", "use", "very", "was", "way", "we", "we", "were", "what", "when", "where", "which", "who", "will", "with", "would", "you", "your"};

    // Character tables; not read in the visible code (presumably used by the missing tokenize body).
    public static char[] STOP_CHARS = {'\"', '#', '\'', '(', ')', '*', '+', ':', ';', '<', '>', '[', '\\', ']', '^', '`', '{', '|', '}'};
    public static char[] DELIMITER_CHARS = {'-', '=', '?', '_', '~'};

    static {
        // STOP_WORDS is looked up with Arrays.binarySearch, which requires sorted input.
        // The char tables are sorted the same way, presumably for the same kind of lookup.
        Arrays.sort(STOP_WORDS);
        Arrays.sort(STOP_CHARS);
        Arrays.sort(DELIMITER_CHARS);
    }

    /**
     * Creates a tokenizer. The {@link LinguisticAnalyzer} singleton is requested
     * here and the result discarded, presumably so that it is initialised before
     * the first token is validated.
     */
    public SpamTokenizer() {
        LinguisticAnalyzer.getInstance();
    }

    /**
     * Tokenizes the given text by wrapping it in a {@link StringReader} and
     * delegating to {@link #tokenize(Reader, boolean, TokenErrorRecorder)}.
     *
     * @param str the text to tokenize; may be {@code null}
     * @param z flag passed through unchanged to the reader-based overload
     * @param tokenErrorRecorder recorder passed through unchanged to the reader-based overload
     * @return the result of the reader-based overload, or {@code null} when {@code str} is {@code null}
     * @throws IOException if the reader-based overload fails
     */
    public String[] tokenize(String str, boolean z, TokenErrorRecorder tokenErrorRecorder) throws IOException {
        if (str == null) {
            // Same contract as the two-argument overload instead of a NullPointerException from StringReader.
            return null;
        }
        return tokenize(new StringReader(str), z, tokenErrorRecorder);
    }

    /**
     * Tokenizes the given text with the boolean flag set to {@code false}.
     *
     * @param str the text to tokenize; may be {@code null}
     * @param tokenErrorRecorder recorder passed through unchanged to the reader-based overload
     * @return the result of the reader-based overload, or {@code null} when {@code str} is {@code null}
     * @throws IOException if the reader-based overload fails
     */
    public String[] tokenize(String str, TokenErrorRecorder tokenErrorRecorder) throws IOException {
        return tokenize(str, false, tokenErrorRecorder);
    }

    /* JADX WARN: Can't fix incorrect switch cases order, some code will duplicate */
    /* JADX WARN: Code restructure failed: missing block: B:41:0x03f5, code lost:
    
        if (r12 == null) goto L156;
     */
    /* JADX WARN: Code restructure failed: missing block: B:42:0x03f8, code lost:
    
        r11 = r12.size();
     */
    /* JADX WARN: Code restructure failed: missing block: B:43:0x0401, code lost:
    
        r0.delete(0, r0.length());
        r28 = false;
        r21 = false;
        r25 = false;
        r35 = 0;
     */
    /*
        Code decompiled incorrectly, please refer to instructions dump.
        To view partially-correct add '--show-bad-code' argument
    */
    /**
     * Reader-based tokenization entry point.
     *
     * <p>NOTE(review): the decompiler failed to restructure this method (see the
     * warnings above), so the real implementation is absent from this source and
     * every call throws {@link UnsupportedOperationException}. It must be restored
     * from the original sources or bytecode before this class can be used.
     *
     * @param reader source of the text to tokenize
     * @param z flag whose meaning cannot be determined from the visible code
     * @param tokenErrorRecorder recorder whose use cannot be determined from the visible code
     * @return never returns normally in this decompiled form
     * @throws IOException declared by the original signature
     */
    public String[] tokenize(Reader reader, boolean z, TokenErrorRecorder tokenErrorRecorder) throws IOException {
        /*
            Method dump skipped, instructions count: 1129
            To view this dump add '--comments-level debug' option
        */
        throw new UnsupportedOperationException("Method not decompiled: org.jasen.core.token.SpamTokenizer.tokenize(java.io.Reader, boolean, org.jasen.interfaces.TokenErrorRecorder):java.lang.String[]");
    }

    /**
     * Classifies a candidate token.
     *
     * <p>Checks run in this order: minimum length, stop word, purely numeric,
     * linguistic check (unless skipped), token limit.
     *
     * @param token the candidate token
     * @param skipLinguisticCheck when {@code true} the {@link LinguisticAnalyzer} word check is not performed
     * @param tokenLimit maximum number of tokens allowed
     * @param tokenCount number of tokens counted so far
     * @return {@code VALID_TOKEN} or one of the {@code INVALID_TOKEN_*} codes
     */
    private int isValidToken(String token, boolean skipLinguisticCheck, int tokenLimit, int tokenCount) {
        if (token.trim().length() < MIN_TOKEN_LENGTH) {
            return INVALID_TOKEN_TOO_SHORT;
        }
        // The stop list is English; a fixed locale keeps the lookup independent of the
        // JVM default (e.g. the Turkish dotless-i mapping of 'I').
        if (Arrays.binarySearch(STOP_WORDS, token.toLowerCase(Locale.ENGLISH)) >= 0) {
            return INVALID_TOKEN_STOP_WORD;
        }
        if (isOnlyNumeric(token)) {
            return INVALID_TOKEN_ONLY_NUMERIC;
        }
        if (!skipLinguisticCheck && !LinguisticAnalyzer.getInstance().isWord(token)) {
            return INVALID_TOKEN_LINGUISTIC_ERROR;
        }
        return tokenCount + 1 > tokenLimit ? INVALID_TOKEN_MAX_TOKENS_EXCEEDED : VALID_TOKEN;
    }

    /**
     * Adds a value to the given list, creating the list on demand.
     *
     * <p>When {@code domainsAndIpsOnly} is set, the value is stored only if it is an
     * IP address (as is) or {@code reduceToDomain} is set (reduced through
     * {@link DNSUtils#getValidDomainOnly}); anything else leaves the list untouched.
     * Otherwise the value is always processed: a valid mail address is replaced by
     * {@code URLParser.URL_PREFIX} plus the part after {@code '@'}, a domain or IP
     * address is stored with {@code URLParser.URL_PREFIX} in front, and any other
     * value is stored unchanged. Stored values are trimmed; a value for which no
     * valid domain could be derived is dropped.
     *
     * @param stringBuffer unused; kept for signature compatibility
     * @param str the raw value
     * @param list the list to add to, or {@code null} to have one created
     * @param reduceToDomain whether a non-IP value should be reduced to its valid domain
     * @param domainsAndIpsOnly whether only domains / IP addresses are stored, without prefix
     * @return the (possibly newly created) list; may still be {@code null} when nothing was eligible
     */
    private List addToken(StringBuffer stringBuffer, String str, List list, boolean reduceToDomain, boolean domainsAndIpsOnly) {
        boolean ipAddress = DNSUtils.isIPAddress(str);
        if (domainsAndIpsOnly && !reduceToDomain && !ipAddress) {
            // Not eligible in this mode: the list is returned as received, without being created.
            return list;
        }
        if (list == null) {
            list = new LinkedList();
        }
        String value = str;
        if (domainsAndIpsOnly) {
            if (!ipAddress) {
                value = DNSUtils.getValidDomainOnly(value);
            }
        } else {
            int at = value.indexOf('@');
            if (at > -1 && MimeUtils.isValidAddress(value)) {
                value = URLParser.URL_PREFIX + value.substring(at + 1);
            } else if (reduceToDomain && !ipAddress) {
                value = DNSUtils.getValidDomainOnly(value);
                if (value != null) {
                    value = URLParser.URL_PREFIX + value;
                }
            } else if (ipAddress) {
                value = URLParser.URL_PREFIX + value;
            }
        }
        if (value != null) {
            list.add(value.trim());
        }
        return list;
    }

    /**
     * Tells whether every character of the string is an ASCII digit.
     *
     * @param str the string to inspect
     * @return {@code true} if no non-digit character is present (including for an empty string)
     */
    private boolean isOnlyNumeric(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isInteger(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the next character of the reader without consuming it.
     *
     * @param reader a reader that supports {@code mark}/{@code reset}
     * @param cArr scratch buffer; its first element receives the peeked character
     * @return the next character, or U+FFFF when the end of the stream has been reached
     * @throws IOException if reading, marking or resetting fails
     */
    private char peek(Reader reader, char[] cArr) throws IOException {
        reader.mark(1);
        // Read exactly one character so the read can never exceed the mark's read-ahead limit of 1.
        int read = reader.read(cArr, 0, 1);
        reader.reset();
        return read > -1 ? cArr[0] : END_OF_STREAM;
    }

    /** Matches the plain space, U+00A0 (no-break space), U+2007 (figure space) and U+202F (narrow no-break space). */
    private boolean isSpace(char c) {
        return c == ' ' || c == '\u00a0' || c == '\u2007' || c == '\u202f';
    }

    /** Matches the ASCII apostrophe, the backtick and U+00B4 (acute accent). */
    private boolean isApostrophe(char c) {
        return c == '\'' || c == '`' || c == '\u00b4';
    }

    /**
     * Matches 's', 't', 'r' in either case, plus U+00AE (registered sign).
     * NOTE(review): the reason U+00AE is accepted is not visible here; preserved as found.
     */
    private boolean isValidApostropheNextChar(char c) {
        return c == 's' || c == 'S' || c == 't' || c == 'T' || c == 'r' || c == 'R' || c == '\u00ae';
    }

    /** Matches characters from '!' through 'z' inclusive (so '{', '|', '}' and '~' are excluded). */
    private boolean isNormalAscii(char c) {
        return c >= '!' && c <= 'z';
    }

    /** Matches code points U+0080 through U+0236 inclusive. */
    private boolean isExtendedAscii(char c) {
        return c >= '\u0080' && c <= '\u0236';
    }

    /** Matches the ASCII digits '0' through '9'. */
    private boolean isInteger(char c) {
        return c >= '0' && c <= '9';
    }

    /**
     * Appends the character to the buffer unless suppressed.
     *
     * @param c the character to append
     * @param stringBuffer the destination buffer
     * @param z when {@code true} nothing is appended
     */
    private void appendChar(char c, StringBuffer stringBuffer, boolean z) {
        if (!z) {
            stringBuffer.append(c);
        }
    }

    /**
     * Maps a character through the replacement table of {@link LinguisticAnalyzer}.
     *
     * @param c the character to map
     * @return the entry of {@code EXTENDED_UNICODE_REPLACE} at the index where {@code c} is found in
     *         {@code EXTENDED_UNICODE_SEARCH}, or {@code c} itself when it is not found
     */
    private char getExtendedReplacement(char c) {
        int index = Arrays.binarySearch(LinguisticAnalyzer.EXTENDED_UNICODE_SEARCH, c);
        return index > -1 ? LinguisticAnalyzer.EXTENDED_UNICODE_REPLACE[index] : c;
    }

    /**
     * @return the configured token limit
     */
    public int getMaxTokens() {
        return this.maxTokens;
    }

    /**
     * @param i the new token limit
     */
    public void setMaxTokens(int i) {
        this.maxTokens = i;
    }
}
