package org.modeshape.extractor.tika;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.TreeSet;
import org.hamcrest.core.Is;
import org.hamcrest.core.IsNull;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.modeshape.common.FixFor;
import org.modeshape.common.util.FileUtil;
import org.modeshape.common.util.IoUtil;
import org.modeshape.jcr.InMemoryTestBinary;
import org.modeshape.jcr.mimetype.MimeTypeDetector;
import org.modeshape.jcr.mimetype.MimeTypeDetectors;
import org.modeshape.jcr.text.TextExtractorContext;
import org.modeshape.jcr.text.TextExtractorOutput;

/* loaded from: input_file:org/modeshape/extractor/tika/TikaTextExtractorTest.class */
/**
 * Unit tests for {@link TikaTextExtractor}: checks which MIME types the extractor claims to support and verifies
 * the text it extracts from sample files loaded from the test classpath.
 * <p>
 * Extraction tests work on two term queues that are rebuilt before every test: {@code extracted} holds the
 * whitespace/quote-separated terms produced by the extractor, {@code expected} holds the terms of a reference text
 * file. Assertions consume {@code extracted} from the front, one term at a time.
 * </p>
 */
public class TikaTextExtractorTest {
    /** Base write limit; the write-limit test uses a text slightly longer than this value. */
    private static final int DEFAULT_TIKA_WRITE_LIMIT = 100000;
    /** Alphabet used to generate random text. */
    private static final String CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    /** Fixed prefix placed at the start of every generated random text. */
    private static final String RANDOM_TEXT_PREFIX = "this is a text file ";
    private static final MimeTypeDetector DETECTOR = new MimeTypeDetectors();
    private static final Random RANDOM = new Random();

    private TikaTextExtractor extractor;
    /** Terms produced by the extractor, in document order; consumed from the front by the assertions. */
    private LinkedList<String> extracted;
    /** Terms of the reference text, in document order. */
    private LinkedList<String> expected;

    /** Creates and initializes a fresh extractor and empties both term queues. */
    @Before
    public void beforeEach() {
        this.extractor = new TikaTextExtractor();
        this.extractor.initialize();
        this.extracted = new LinkedList<>();
        this.expected = new LinkedList<>();
    }

    @Test
    public void shouldHavePredefinedMimeTypesByDefault() {
        Assert.assertTrue(this.extractor.getIncludedMediaTypes().isEmpty());
        // Both sides are copied into sorted sets before being compared.
        Assert.assertEquals(new TreeSet<>(TikaTextExtractor.DEFAULT_EXCLUDED_MIME_TYPES),
                            new TreeSet<>(this.extractor.getExcludedMediaTypes()));
        Assert.assertFalse(this.extractor.getParserSupportedMediaTypes().isEmpty());
    }

    @Test
    public void shouldSupportExtractingFromTextFiles() throws Exception {
        Assert.assertTrue(this.extractor.supportsMimeType("text/plain"));
    }

    @Test
    public void shouldSupportExtractingFromPdfFiles() throws Exception {
        Assert.assertTrue(this.extractor.supportsMimeType("application/pdf"));
    }

    @Test
    public void shouldNotSupportExtractingFromPostscriptFiles() throws Exception {
        Assert.assertFalse(this.extractor.supportsMimeType("application/postscript"));
    }

    @Test
    public void shouldSupportExtractingFromDocWordFiles() throws Exception {
        Assert.assertTrue(this.extractor.supportsMimeType("application/msword"));
    }

    @Test
    public void shouldSupportExtractingFromDocxWordFiles() throws Exception {
        Assert.assertTrue(this.extractor.supportsMimeType("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
    }

    @Test
    public void shouldExtractTextFromTextFile1() throws Exception {
        extractTermsFrom("modeshape.txt");
        loadExpectedFrom("modeshape.txt");
        extractedShouldHave(remainingExpectedTerms());
    }

    @Test
    public void shouldExtractTextFromTextFile2() throws Exception {
        extractTermsFrom("text-file.txt");
        loadExpectedFrom("text-file.txt");
        extractedShouldHave(remainingExpectedTerms());
    }

    @Test
    public void shouldExtractTextFromDocFile() throws Exception {
        extractTermsFrom("modeshape.doc");
        loadExpectedFrom("modeshape.txt");
        extractedShouldHave(remainingExpectedTerms());
    }

    @Test
    public void shouldExtractTextFromDocxFile() throws Exception {
        extractTermsFrom("modeshape.docx");
        loadExpectedFrom("modeshape.txt");
        // NOTE(review): unlike the .doc test, the extracted terms were never compared with the expected ones, so this
        // test verified nothing beyond "no exception was thrown". Until a term-by-term comparison is confirmed to
        // hold for the .docx sample, at least require that some text was extracted.
        Assert.assertFalse("No text was extracted from modeshape.docx", this.extracted.isEmpty());
    }

    @Test
    public void shouldExtractTextFromPdfFileGS() throws Exception {
        extractTermsFrom("modeshape_gs.pdf");
        assertExtractedMatchesExpected();
    }

    /**
     * Writes a random text that is two characters longer than {@link #DEFAULT_TIKA_WRITE_LIMIT} to a temporary file,
     * raises the extractor's write limit to that length and expects the complete text to be extracted.
     */
    @Test
    @FixFor({"MODE-1561"})
    public void shouldExtractUsingWriteLimit() throws Exception {
        int textLength = DEFAULT_TIKA_WRITE_LIMIT + 2;
        String text = randomString(textLength);
        File tempFile = File.createTempFile("tika_extraction_", ".txt");
        try {
            IoUtil.write(text, tempFile);
            this.extractor.setWriteLimit(textLength);
            TextExtractorOutput output = new TextExtractorOutput();
            // Close the stream before the file is deleted; an open handle can block deletion on some platforms.
            try (InputStream stream = new FileInputStream(tempFile)) {
                this.extractor.extractFrom(new InMemoryTestBinary(stream), output, new TextExtractorContext(DETECTOR));
            }
            Assert.assertEquals(text, output.getText());
        } finally {
            FileUtil.delete(tempFile);
        }
    }

    @Test
    @Ignore("Exposes the Tika/PDF box bug that characters get duplicated when parsing pdfs produced by PDF Context")
    public void shouldExtractTextFromPdfFilePdfContext() throws Exception {
        extractTermsFrom("modeshape_pdfcontext.pdf");
        assertExtractedMatchesExpected();
    }

    @Test
    @FixFor({"MODE-1810"})
    public void shouldExtractTextFromXlsxFile() throws Exception {
        extractTermsFrom("sample-file.xlsx");
        Assert.assertFalse("No text was extracted from sample-file.xlsx", this.extracted.isEmpty());
    }

    /**
     * Builds a text that starts with the fixed prefix {@code "this is a text file "} and is padded with random
     * characters from {@code 0-9A-Z} up to the requested length.
     *
     * @param i the requested total length; when it is smaller than the prefix length no padding is added and the
     *        prefix alone is returned (so the result is longer than requested)
     * @return the generated text
     */
    public static String randomString(int i) {
        StringBuilder text = new StringBuilder(i);
        text.append(RANDOM_TEXT_PREFIX);
        int paddingLength = i - RANDOM_TEXT_PREFIX.length();
        for (int n = 0; n < paddingLength; n++) {
            text.append(CHARS.charAt(RANDOM.nextInt(CHARS.length())));
        }
        return text.toString();
    }

    /**
     * Verifies the terms extracted from one of the PDF samples against {@code modeshape.txt}. The PDF output is
     * expected to contain, in order: a date and a "- 1/2 -" marker, the reference terms through "- versioning",
     * the date again and a "- 2/2 -" marker, and finally the rest of the reference terms (presumably the markers
     * are per-page headers or footers of the two-page document).
     */
    private void assertExtractedMatchesExpected() throws IOException {
        loadExpectedFrom("modeshape.txt");
        extractedShouldHave("2011-01-24");
        extractedShouldHave("-", "1/2", "-");
        extractedShouldHave(expectedTermsThrough("-", "versioning"));
        extractedShouldHave("2011-01-24");
        extractedShouldHave("-", "2/2", "-");
        extractedShouldHave(remainingExpectedTerms());
    }

    /**
     * @return the live queue of expected terms that have not been consumed by {@link #expectedTermsThrough}
     */
    private List<String> remainingExpectedTerms() {
        return this.expected;
    }

    /**
     * Consumes one extracted term per supplied term and asserts that they are equal, in order.
     *
     * @param strArr the terms that must appear next in the extracted text
     */
    private void extractedShouldHave(String... strArr) {
        for (String term : strArr) {
            // Fail with a readable message instead of a bare NoSuchElementException from pop().
            Assert.assertFalse("Extracted text ended before term: " + term, this.extracted.isEmpty());
            Assert.assertThat(this.extracted.pop(), Is.is(term));
        }
    }

    /**
     * Consumes one extracted term per supplied term and asserts that they are equal, in order. Terms for which the
     * extracted text has already run out are collected and reported together as missing.
     *
     * @param list the terms that must appear next in the extracted text
     */
    private void extractedShouldHave(List<String> list) {
        List<String> missing = new LinkedList<>();
        for (String term : list) {
            if (this.extracted.isEmpty()) {
                missing.add(term);
            } else {
                Assert.assertThat(this.extracted.pop(), Is.is(term));
            }
        }
        Assert.assertTrue("Missing words: " + missing, missing.isEmpty());
    }

    /**
     * Removes terms from the front of the expected queue up to and including the first occurrence of the supplied
     * consecutive marker terms, and returns everything that was removed.
     * <p>
     * If the markers never occur, all remaining expected terms are removed and returned. A term that breaks a
     * partial match is re-checked against the first marker; this simple restart is sufficient for short markers
     * but is not a general substring search.
     * </p>
     *
     * @param strArr the consecutive terms that end the returned section
     * @return the removed terms in their original order; empty when no markers are given or the first one is null
     */
    private List<String> expectedTermsThrough(String... strArr) {
        if (strArr == null || strArr.length == 0 || strArr[0] == null) {
            return Collections.emptyList();
        }
        List<String> consumed = new LinkedList<>();
        int matched = 0; // number of leading markers matched by the most recently consumed terms
        while (matched < strArr.length && !this.expected.isEmpty()) {
            String term = this.expected.pop();
            consumed.add(term);
            if (term.equals(strArr[matched])) {
                matched++;
            } else {
                matched = term.equals(strArr[0]) ? 1 : 0;
            }
        }
        return consumed;
    }

    /**
     * Runs the extractor on a classpath resource and appends the terms of the extracted text to {@code extracted}.
     *
     * @param str the name of the classpath resource to extract from
     */
    private void extractTermsFrom(String str) throws Exception {
        TextExtractorOutput output = new TextExtractorOutput();
        try (InputStream stream = getClass().getClassLoader().getResourceAsStream(str)) {
            Assert.assertThat("Test resource not found on the classpath: " + str, stream, Is.is(IsNull.notNullValue()));
            this.extractor.extractFrom(new InMemoryTestBinary(stream), output, new TextExtractorContext(DETECTOR));
        }
        addWords(this.extracted, output.getText());
    }

    /**
     * Reads a classpath resource as text and appends its terms to {@code expected}.
     *
     * @param str the name of the classpath resource holding the reference text
     */
    private void loadExpectedFrom(String str) throws IOException {
        try (InputStream stream = getClass().getClassLoader().getResourceAsStream(str)) {
            Assert.assertThat("Test resource not found on the classpath: " + str, stream, Is.is(IsNull.notNullValue()));
            addWords(this.expected, IoUtil.read(stream));
        }
    }

    /**
     * Splits the text on runs of whitespace and double quotes and appends every non-empty piece to the list.
     *
     * @param list the list to append to
     * @param str the text to split; a null text contributes no terms
     */
    private void addWords(List<String> list, String str) {
        if (str == null) {
            return;
        }
        for (String word : str.split("[\\s\"]+")) {
            if (!word.isEmpty()) {
                list.add(word);
            }
        }
    }
}
