/*
 * Decompiled with CFR 0.152.
 */
package ai.djl.modality.nlp.bert;

import ai.djl.modality.nlp.SimpleVocabulary;
import ai.djl.modality.nlp.preprocess.SimpleTokenizer;
import java.util.ArrayList;
import java.util.List;

/**
 * Tokenizer that breaks each token produced by the parent {@link SimpleTokenizer} into
 * word pieces found in a vocabulary.
 *
 * <p>Every token is matched greedily, longest candidate first: starting at the current
 * position the longest substring contained in the vocabulary is taken as the next piece,
 * then matching resumes right after it. Pieces that do not start at the beginning of the
 * token are looked up (and emitted) with a {@code "##"} prefix. A token that is longer
 * than the configured limit, or that cannot be covered completely by vocabulary entries,
 * is replaced by the single "unknown" token.
 */
public class WordpieceTokenizer
extends SimpleTokenizer {
    /** Prefix marking a piece that continues a token instead of starting it. */
    private static final String CONTINUATION_PREFIX = "##";

    /** Token emitted in place of a word that cannot be split into vocabulary pieces. */
    private final String unknown;
    /** Maximum token length (in {@code char}s); longer tokens are emitted as {@link #unknown}. */
    private final int maxInputChars;
    /** Vocabulary consulted to decide whether a candidate piece is known. */
    private final SimpleVocabulary vocabulary;

    /**
     * Creates a word-piece tokenizer.
     *
     * @param vocabulary the vocabulary used to look up candidate pieces
     * @param unknown the token emitted for words that cannot be tokenized
     * @param maxInputChars the maximum length, in {@code char}s, of a token that will be
     *     split; longer tokens are emitted as {@code unknown}
     */
    public WordpieceTokenizer(SimpleVocabulary vocabulary, String unknown, int maxInputChars) {
        this.unknown = unknown;
        this.maxInputChars = maxInputChars;
        this.vocabulary = vocabulary;
    }

    /**
     * Splits a sentence into word pieces.
     *
     * <p>The trimmed sentence is first tokenized by the parent class; each resulting token
     * is then either expanded into its word pieces or replaced by the unknown token.
     *
     * @param sentence the text to tokenize
     * @return the word pieces, in order of appearance
     * @throws NullPointerException if {@code sentence} is {@code null}
     */
    @Override
    public List<String> tokenize(String sentence) {
        // Scratch objects are shared across tokens to avoid re-allocating them per word.
        StringBuilder candidateBuffer = new StringBuilder();
        List<String> pieces = new ArrayList<>();
        List<String> outputTokens = new ArrayList<>();
        for (String token : super.tokenize(sentence.trim())) {
            if (token.length() <= this.maxInputChars
                    && this.splitIntoPieces(token, candidateBuffer, pieces)) {
                outputTokens.addAll(pieces);
            } else {
                outputTokens.add(this.unknown);
            }
        }
        return outputTokens;
    }

    /**
     * Greedily covers {@code token} with vocabulary entries, longest match first.
     *
     * <p>No upper bound on the number of pieces is enforced here: every piece consumes at
     * least one {@code char}, so a token never yields more pieces than it has characters.
     *
     * @param token the token to split
     * @param candidateBuffer scratch buffer used to assemble lookup candidates
     * @param pieces receives the pieces; cleared first, and only meaningful when this
     *     method returns {@code true}
     * @return {@code true} if the whole token was covered, {@code false} if some position
     *     had no matching vocabulary entry
     */
    private boolean splitIntoPieces(String token, StringBuilder candidateBuffer, List<String> pieces) {
        pieces.clear();
        int length = token.length();
        int start = 0;
        while (start < length) {
            String match = null;
            int end = length;
            // Shrink the candidate from the right until the vocabulary knows it.
            while (end > start) {
                candidateBuffer.setLength(0);
                if (start > 0) {
                    candidateBuffer.append(CONTINUATION_PREFIX);
                }
                candidateBuffer.append(token, start, end);
                String candidate = candidateBuffer.toString();
                if (this.vocabulary.contains(candidate)) {
                    match = candidate;
                    break;
                }
                --end;
            }
            if (match == null) {
                // Not even a single character at this position is in the vocabulary.
                return false;
            }
            pieces.add(match);
            start = end;
        }
        return true;
    }
}

