/*
 * Decompiled with CFR 0.152.
 */
package org.modeshape.extractor.tika;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.TreeSet;
import org.hamcrest.Matcher;
import org.hamcrest.core.Is;
import org.hamcrest.core.IsNull;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.modeshape.common.FixFor;
import org.modeshape.common.util.FileUtil;
import org.modeshape.common.util.IoUtil;
import org.modeshape.extractor.tika.TikaTextExtractor;
import org.modeshape.jcr.Environment;
import org.modeshape.jcr.InMemoryTestBinary;
import org.modeshape.jcr.TestingEnvironment;
import org.modeshape.jcr.api.Binary;
import org.modeshape.jcr.api.text.TextExtractor;
import org.modeshape.jcr.mimetype.ContentDetector;
import org.modeshape.jcr.mimetype.MimeTypeDetector;
import org.modeshape.jcr.text.TextExtractorContext;
import org.modeshape.jcr.text.TextExtractorOutput;

/**
 * Tests for {@link TikaTextExtractor}: checks which MIME types the extractor reports as supported
 * and compares the whitespace/quote-separated terms extracted from sample documents on the test
 * classpath against the terms of a reference text file.
 */
public class TikaTextExtractorTest {
    private static final MimeTypeDetector DETECTOR = new ContentDetector(new TestingEnvironment());
    private static final int DEFAULT_TIKA_WRITE_LIMIT = 100000;
    private static final String CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    private static final Random RANDOM = new Random();

    private TikaTextExtractor extractor;
    /** Terms produced by the extractor that have not been verified yet, in document order. */
    private LinkedList<String> extracted;
    /** Terms read from the reference file that have not been consumed yet, in document order. */
    private LinkedList<String> expected;

    /** Creates and initializes a fresh extractor and empties both term lists before every test. */
    @Before
    public void beforeEach() {
        this.extractor = new TikaTextExtractor();
        this.extractor.initialize();
        this.extracted = new LinkedList<>();
        this.expected = new LinkedList<>();
    }

    @Test
    public void shouldHavePredefinedMimeTypesByDefault() {
        Assert.assertThat(this.extractor.getIncludedMediaTypes().isEmpty(), Is.is(true));
        // Compare through sorted sets so that neither ordering nor the concrete collection type matters.
        Assert.assertEquals(new TreeSet<>(TikaTextExtractor.DEFAULT_EXCLUDED_MIME_TYPES),
                            new TreeSet<>(this.extractor.getExcludedMediaTypes()));
        Assert.assertFalse(this.extractor.getParserSupportedMediaTypes().isEmpty());
    }

    @Test
    public void shouldSupportExtractingFromTextFiles() throws Exception {
        Assert.assertThat(this.extractor.supportsMimeType("text/plain"), Is.is(true));
    }

    @Test
    public void shouldSupportExtractingFromPdfFiles() throws Exception {
        Assert.assertThat(this.extractor.supportsMimeType("application/pdf"), Is.is(true));
    }

    @Test
    public void shouldNotSupportExtractingFromPostscriptFiles() throws Exception {
        Assert.assertThat(this.extractor.supportsMimeType("application/postscript"), Is.is(false));
    }

    @Test
    public void shouldSupportExtractingFromDocWordFiles() throws Exception {
        Assert.assertThat(this.extractor.supportsMimeType("application/msword"), Is.is(true));
    }

    @Test
    public void shouldSupportExtractingFromDocxWordFiles() throws Exception {
        Assert.assertThat(this.extractor.supportsMimeType("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
                          Is.is(true));
    }

    @Test
    public void shouldExtractTextFromTextFile1() throws Exception {
        this.extractTermsFrom("modeshape.txt");
        this.loadExpectedFrom("modeshape.txt");
        this.extractedShouldHave(this.remainingExpectedTerms());
    }

    @Test
    public void shouldExtractTextFromTextFile2() throws Exception {
        this.extractTermsFrom("text-file.txt");
        this.loadExpectedFrom("text-file.txt");
        this.extractedShouldHave(this.remainingExpectedTerms());
    }

    @Test
    public void shouldExtractTextFromDocFile() throws Exception {
        this.extractTermsFrom("modeshape.doc");
        this.loadExpectedFrom("modeshape.txt");
        this.extractedShouldHave(this.remainingExpectedTerms());
    }

    @Test
    public void shouldExtractTextFromDocxFile() throws Exception {
        this.extractTermsFrom("modeshape.docx");
        this.loadExpectedFrom("modeshape.txt");
        // Without this comparison the test could never fail on wrong output.
        this.extractedShouldHave(this.remainingExpectedTerms());
    }

    @Test
    public void shouldExtractTextFromPdfFileGS() throws Exception {
        this.extractTermsFrom("modeshape_gs.pdf");
        this.assertExtractedMatchesExpected();
    }

    /**
     * Writes a random string two characters longer than {@link #DEFAULT_TIKA_WRITE_LIMIT} to a
     * temporary file, sets the extractor's write limit to that length, and checks that the
     * extracted text equals the full string.
     */
    @Test
    @FixFor("MODE-1561")
    public void shouldExtractUsingWriteLimit() throws Exception {
        int stringLength = DEFAULT_TIKA_WRITE_LIMIT + 2;
        String rndString = TikaTextExtractorTest.randomString(stringLength);
        File tempFile = File.createTempFile("tika_extraction_", ".txt");
        try {
            IoUtil.write(rndString, tempFile);
            this.extractor.setWriteLimit(Integer.valueOf(stringLength));
            TextExtractorOutput output = new TextExtractorOutput();
            // Close the stream before the temp file is deleted in the finally block.
            try (InputStream content = new FileInputStream(tempFile)) {
                this.extractor.extractFrom(new InMemoryTestBinary(content), output, new TextExtractorContext(DETECTOR));
            }
            Assert.assertEquals(rndString, output.getText());
        } finally {
            FileUtil.delete(tempFile);
        }
    }

    @Test
    @Ignore("Exposes the Tika/PDF box bug that characters get duplicated when parsing pdfs produced by PDF Context")
    public void shouldExtractTextFromPdfFilePdfContext() throws Exception {
        this.extractTermsFrom("modeshape_pdfcontext.pdf");
        this.assertExtractedMatchesExpected();
    }

    @Test
    @FixFor("MODE-1810")
    public void shouldExtractTextFromXlsxFile() throws Exception {
        this.extractTermsFrom("sample-file.xlsx");
        Assert.assertTrue(!this.extracted.isEmpty());
    }

    /**
     * Builds a string that starts with the fixed header {@code "this is a text file "} and is
     * padded with random characters from {@link #CHARS} up to {@code length} characters in total.
     * If {@code length} is not larger than the header, the header alone is returned.
     *
     * @param length the requested total length; also used as the initial builder capacity
     * @return the header followed by the random padding
     */
    public static String randomString(int length) {
        String header = "this is a text file ";
        StringBuilder rndStringBuilder = new StringBuilder(length);
        rndStringBuilder.append(header);
        int paddingLength = length - header.length();
        for (int i = 0; i < paddingLength; ++i) {
            rndStringBuilder.append(CHARS.charAt(RANDOM.nextInt(CHARS.length())));
        }
        return rndStringBuilder.toString();
    }

    /**
     * Verifies the extracted terms of a two-page PDF rendering of {@code modeshape.txt}: each page
     * is expected to start with the terms {@code 2011-01-24}, {@code -}, {@code n/2}, {@code -},
     * and the first page is expected to carry the reference terms through {@code "- versioning"}.
     *
     * @throws IOException if the reference file cannot be read
     */
    private void assertExtractedMatchesExpected() throws IOException {
        this.loadExpectedFrom("modeshape.txt");
        this.extractedShouldHave("2011-01-24");
        this.extractedShouldHave("-", "1/2", "-");
        this.extractedShouldHave(this.expectedTermsThrough("-", "versioning"));
        this.extractedShouldHave("2011-01-24");
        this.extractedShouldHave("-", "2/2", "-");
        this.extractedShouldHave(this.remainingExpectedTerms());
    }

    /**
     * @return the live list of expected terms that have not been consumed yet (not a copy)
     */
    private List<String> remainingExpectedTerms() {
        return this.expected;
    }

    /**
     * Removes one extracted term per supplied word, in order, and asserts that each equals the
     * corresponding word. Fails with a descriptive message if the extracted terms run out.
     *
     * @param words the words expected next in the extracted terms
     */
    private void extractedShouldHave(String... words) {
        for (String word : words) {
            Assert.assertFalse("No extracted terms left; expected: " + word, this.extracted.isEmpty());
            Assert.assertThat(this.extracted.pop(), Is.is(word));
        }
    }

    /**
     * Removes one extracted term per supplied word, in order, and asserts that each equals the
     * corresponding word. Words for which no extracted term is left are collected and reported
     * together in a single failure after the loop.
     *
     * @param words the words expected next in the extracted terms
     */
    private void extractedShouldHave(List<String> words) {
        List<String> missingWords = new LinkedList<>();
        for (String word : words) {
            if (this.extracted.isEmpty()) {
                missingWords.add(word);
                continue;
            }
            Assert.assertThat(this.extracted.pop(), Is.is(word));
        }
        Assert.assertThat("Missing words: " + missingWords, missingWords.size(), Is.is(0));
    }

    /**
     * Consumes expected terms up to and including the first point at which a term equal to
     * {@code words[0]} is immediately followed by the remaining {@code words}. If that sequence
     * never occurs, every remaining expected term is consumed and returned.
     *
     * @param words the sequence of terms that ends the returned run
     * @return the consumed terms in order; an empty list if {@code words} is null or empty
     */
    private List<String> expectedTermsThrough(String... words) {
        if (words == null || words.length == 0) {
            return Collections.emptyList();
        }
        LinkedList<String> result = new LinkedList<>();
        String firstWord = words[0];
        while (firstWord != null && !this.expected.isEmpty()) {
            String word = this.expected.pop();
            result.add(word);
            if (!word.equals(firstWord)) {
                continue;
            }
            // The first word matched; the following terms must match the rest of the sequence.
            boolean foundAll = true;
            for (int i = 1; i != words.length; ++i) {
                if (this.expected.isEmpty()) {
                    // Ran out of terms in the middle of the sequence: no match is possible.
                    foundAll = false;
                    break;
                }
                String next = this.expected.pop();
                result.add(next);
                if (!next.equals(words[i])) {
                    foundAll = false;
                    break;
                }
            }
            if (foundAll) {
                return result;
            }
        }
        return result;
    }

    /**
     * Runs the extractor over a classpath resource and appends the terms of the resulting text to
     * the extracted-terms list.
     *
     * @param resourcePath the classpath location of the document to extract
     * @throws Exception if extraction fails or the resource cannot be read
     */
    private void extractTermsFrom(String resourcePath) throws Exception {
        try (InputStream stream = this.getClass().getClassLoader().getResourceAsStream(resourcePath)) {
            Assert.assertThat(stream, Is.is(IsNull.notNullValue()));
            TextExtractorOutput output = new TextExtractorOutput();
            this.extractor.extractFrom(new InMemoryTestBinary(stream), output, new TextExtractorContext(DETECTOR));
            this.addWords(this.extracted, output.getText());
        }
    }

    /**
     * Reads a classpath resource as text and appends its terms to the expected-terms list.
     *
     * @param resourcePath the classpath location of the reference text
     * @throws IOException if the resource cannot be read
     */
    private void loadExpectedFrom(String resourcePath) throws IOException {
        try (InputStream stream = this.getClass().getClassLoader().getResourceAsStream(resourcePath)) {
            // The resource must exist; a null resource is skipped by try-with-resources on close.
            Assert.assertThat(stream, Is.is(IsNull.notNullValue()));
            this.addWords(this.expected, IoUtil.read(stream));
        }
    }

    /**
     * Splits {@code input} on runs of whitespace and double-quote characters and appends every
     * non-empty piece to {@code words}.
     *
     * @param words the list to append to
     * @param input the text to split
     */
    private void addWords(List<String> words, String input) {
        for (String word : input.split("[\\s\"]+")) {
            if (!word.isEmpty()) {
                words.add(word);
            }
        }
    }
}

