package org.semanticdesktop.aperture.crawler.web;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.apache.lucene.store.BufferedIndexInput;
import org.ontoware.aifbcommons.collection.ClosableIterator;
import org.ontoware.rdf2go.exception.ModelRuntimeException;
import org.ontoware.rdf2go.model.Model;
import org.ontoware.rdf2go.model.node.URI;
import org.ontoware.rdf2go.model.node.impl.URIImpl;
import org.ontoware.rdf2go.vocabulary.RDF;
import org.semanticdesktop.aperture.accessor.AccessData;
import org.semanticdesktop.aperture.accessor.DataAccessor;
import org.semanticdesktop.aperture.accessor.DataAccessorFactory;
import org.semanticdesktop.aperture.accessor.DataObject;
import org.semanticdesktop.aperture.accessor.FileDataObject;
import org.semanticdesktop.aperture.accessor.UrlNotFoundException;
import org.semanticdesktop.aperture.accessor.base.FilterAccessData;
import org.semanticdesktop.aperture.crawler.ExitCode;
import org.semanticdesktop.aperture.crawler.base.CrawlerBase;
import org.semanticdesktop.aperture.datasource.config.DomainBoundaries;
import org.semanticdesktop.aperture.datasource.web.WebDataSource;
import org.semanticdesktop.aperture.hypertext.linkextractor.LinkExtractor;
import org.semanticdesktop.aperture.hypertext.linkextractor.LinkExtractorFactory;
import org.semanticdesktop.aperture.hypertext.linkextractor.LinkExtractorRegistry;
import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier;
import org.semanticdesktop.aperture.util.IOUtil;
import org.semanticdesktop.aperture.util.UrlUtil;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/aperture-1.1.0.Beta1.jar:org/semanticdesktop/aperture/crawler/web/WebCrawler.class */
public class WebCrawler extends CrawlerBase {
    // Used to identify the MIME type of fetched content; initialize() rejects a null value.
    private MimeTypeIdentifier mimeTypeIdentifier;
    // Source of LinkExtractor factories keyed by MIME type; initialize() rejects a null value.
    private LinkExtractorRegistry linkExtractorRegistry;
    // Largest accepted NIE.byteSize value; Long.MAX_VALUE disables the size check.
    private long maxByteSize;
    // Taken from the WebDataSource; handed to the link extractor only when non-null.
    private Boolean includeEmbeddedResources;
    // Taken from the WebDataSource; consulted by schedule() when its domain-check flag is set.
    private DomainBoundaries domainBoundaries;
    // Pending jobs; schedule() keeps them ordered by descending depth value.
    private LinkedList<CrawlJob> jobsQueue;
    // Pending jobs indexed by their normalized URL.
    private HashMap<String, CrawlJob> jobsMap;
    // URLs handled during this crawl; only allocated when no AccessData is available.
    private HashSet<String> crawledUrls;
    // Depth value assigned to the root URL (the data source's maximum depth, or Integer.MAX_VALUE).
    private int initialDepth;
    private Logger logger = LoggerFactory.getLogger(getClass());
    // Wrapper around accessData created by initialize(); stays null when accessData is null.
    private WebAccessData wad = null;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:WEB-INF/lib/aperture-1.1.0.Beta1.jar:org/semanticdesktop/aperture/crawler/web/WebCrawler$StringUriPair.class */
    /**
     * Simple value holder pairing a URL string with its URI representation.
     * Both members may be {@code null}, which the enclosing crawler uses to
     * signal that a link could not be normalized.
     */
    public static class StringUriPair {
        // Assigned once in the constructor and only read afterwards.
        private final String string;
        private final URI uri;

        /**
         * @param str the URL in string form, or {@code null}
         * @param uri the URI for the same URL, or {@code null}
         */
        public StringUriPair(String str, URI uri) {
            this.string = str;
            this.uri = uri;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:WEB-INF/lib/aperture-1.1.0.Beta1.jar:org/semanticdesktop/aperture/crawler/web/WebCrawler$WebAccessData.class */
    /**
     * AccessData wrapper that keeps the crawler's job bookkeeping in sync
     * whenever a redirect is recorded.
     */
    public class WebAccessData extends FilterAccessData {
        /**
         * @param accessData the AccessData instance to delegate to
         */
        public WebAccessData(AccessData accessData) {
            super(accessData);
        }

        /**
         * Stores the value through the wrapped instance. When the key is
         * {@link AccessData#REDIRECTS_TO_KEY}, the id is first touched and any
         * pending crawl job registered for it is dropped from the map and queue.
         */
        @Override
        public void put(String str, String str2, String str3) {
            boolean recordsRedirect = AccessData.REDIRECTS_TO_KEY.equals(str2);
            if (recordsRedirect) {
                touch(str);
                CrawlJob pending = WebCrawler.this.jobsMap.remove(str);
                if (pending != null) {
                    WebCrawler.this.jobsQueue.remove(pending);
                }
            }
            super.put(str, str2, str3);
        }
    }

    /**
     * Sets the identifier used to determine the MIME type of fetched content.
     *
     * @param mimeTypeIdentifier the identifier to use; crawling fails if it is still null
     */
    public void setMimeTypeIdentifier(MimeTypeIdentifier mimeTypeIdentifier) {
        this.mimeTypeIdentifier = mimeTypeIdentifier;
    }

    /**
     * @return the currently configured MIME type identifier, possibly null
     */
    public MimeTypeIdentifier getMimeTypeIdentifier() {
        return mimeTypeIdentifier;
    }

    /**
     * Sets the registry from which link extractors are looked up by MIME type.
     *
     * @param linkExtractorRegistry the registry to use; crawling fails if it is still null
     */
    public void setLinkExtractorRegistry(LinkExtractorRegistry linkExtractorRegistry) {
        this.linkExtractorRegistry = linkExtractorRegistry;
    }

    /**
     * @return the currently configured link extractor registry, possibly null
     */
    public LinkExtractorRegistry getLinkExtractorRegistry() {
        return linkExtractorRegistry;
    }

    /**
     * Runs one complete crawl: sets up the per-crawl state, works through the
     * job queue and finally cleans up stale redirect records.
     *
     * @return {@link ExitCode#COMPLETED} when the queue was fully drained,
     *         {@link ExitCode#STOP_REQUESTED} when jobs were left behind
     */
    @Override
    protected ExitCode crawlObjects() {
        try {
            initialize();
            processQueue();
            removeDeprecatedRedirections();
            // The queue must be inspected before cleanUp() discards it.
            return this.jobsQueue.isEmpty() ? ExitCode.COMPLETED : ExitCode.STOP_REQUESTED;
        } finally {
            // Release the per-crawl state even when one of the phases throws.
            cleanUp();
        }
    }

    /**
     * Prepares the per-crawl state and schedules the root URL of the data source.
     *
     * @throws IllegalArgumentException when the MIME type identifier or the
     *         link extractor registry has not been set
     */
    private void initialize() {
        if (this.mimeTypeIdentifier == null) {
            throw new IllegalArgumentException("MimeTypeIdentifier missing");
        }
        if (this.linkExtractorRegistry == null) {
            throw new IllegalArgumentException("LinkExtractorRegistry missing");
        }

        // Plain sizing hint for the hash-based bookkeeping structures.
        final int initialCapacity = 1024;
        this.jobsQueue = new LinkedList<>();
        this.jobsMap = new HashMap<>(initialCapacity);

        // Exactly one of wad / crawledUrls tracks crawled URLs. Reset the other
        // one explicitly so that state from an earlier crawl is never consulted.
        if (this.accessData == null) {
            this.wad = null;
            this.crawledUrls = new HashSet<>(initialCapacity);
        } else {
            this.crawledUrls = null;
            this.wad = new WebAccessData(this.accessData);
        }

        WebDataSource webDataSource = (WebDataSource) getDataSource();
        String rootUrl = webDataSource.getRootUrl();
        this.domainBoundaries = webDataSource.getDomainBoundaries();
        this.includeEmbeddedResources = webDataSource.getIncludeEmbeddedResources();

        // A missing maximum depth or size means "unbounded".
        Integer maximumDepth = webDataSource.getMaximumDepth();
        this.initialDepth = maximumDepth == null ? Integer.MAX_VALUE : maximumDepth.intValue();
        Long maximumSize = webDataSource.getMaximumSize();
        this.maxByteSize = maximumSize == null ? Long.MAX_VALUE : maximumSize.longValue();

        // The root URL is scheduled without the domain boundary check.
        schedule(rootUrl, this.initialDepth, false);
    }

    /**
     * Queues a URL for crawling unless it cannot be normalized, has already
     * been crawled, or (when requested) lies outside the domain boundaries.
     * An already queued URL is only re-queued when the new depth value is larger.
     *
     * @param str the URL to schedule; null is ignored
     * @param i   the depth value to associate with the job
     * @param z   whether the domain boundaries must be checked
     */
    private void schedule(String str, int i, boolean z) {
        if (str == null) {
            return;
        }
        String normalized = normalizeAndFixURL(str, null).string;
        if (normalized == null || isCrawled(normalized)) {
            return;
        }
        if (z && !this.domainBoundaries.inDomain(normalized)) {
            return;
        }

        CrawlJob job = this.jobsMap.get(normalized);
        if (job != null) {
            if (job.getDepth() >= i) {
                // The queued job already carries an equal or larger depth value.
                return;
            }
            job.setDepth(i);
            this.jobsQueue.remove(job);
        } else {
            job = new CrawlJob(normalized, i);
            this.jobsMap.put(normalized, job);
        }

        // Walk backwards from the tail until a job with an equal or larger depth
        // value is found, then step forward again so the new job lands right after it.
        ListIterator<CrawlJob> cursor = this.jobsQueue.listIterator(this.jobsQueue.size());
        while (cursor.hasPrevious()) {
            if (cursor.previous().getDepth() >= i) {
                cursor.next();
                break;
            }
        }
        cursor.add(job);
    }

    /**
     * Tells whether the URL has been handled during this crawl, consulting the
     * access data wrapper when present and the in-memory set otherwise.
     */
    private boolean isCrawled(String str) {
        if (this.wad == null) {
            return this.crawledUrls.contains(str);
        }
        return this.wad.isTouched(str);
    }

    /**
     * Records the URL as handled, either by touching it in the access data
     * wrapper or by adding it to the in-memory set.
     */
    private void addCrawled(String str) {
        if (this.wad == null) {
            this.crawledUrls.add(str);
            return;
        }
        this.wad.touch(str);
    }

    /**
     * Works through the job queue until it is empty or a stop is requested.
     * For every job the URL is fetched through the matching DataAccessor and,
     * depending on the outcome, reported as unmodified, new, modified or deleted.
     * Links found in fetched objects are scheduled with a depth value one lower.
     */
    private void processQueue() {
        while (!this.jobsQueue.isEmpty() && !isStopRequested()) {
            CrawlJob job = this.jobsQueue.removeFirst();
            String url = job.getURL();
            int depth = job.getDepth();

            // Debug aid only: flag URLs that java.net.URI cannot parse.
            if (this.logger.isDebugEnabled()) {
                try {
                    new java.net.URI(url);
                } catch (URISyntaxException e) {
                    this.logger.debug("Faulty url: " + url);
                }
            }

            reportAccessingObject(url);
            // Captured before the URL is marked as crawled below.
            boolean isKnownId = this.accessData != null && this.accessData.isKnownId(url);
            addCrawled(url);
            this.jobsMap.remove(url);

            DataAccessor accessor = getDataAccessor(url);
            if (accessor == null) {
                continue;
            }

            try {
                DataObject object = accessor.getDataObjectIfModified(url, this.source, this.wad, null, getRDFContainerFactory(url));
                if (object == null) {
                    // Unmodified: fall back to the links remembered from an earlier crawl.
                    reportUnmodifiedDataObject(url);
                    if (depth > 0) {
                        scheduleCachedLinks(url, depth - 1);
                    }
                    continue;
                }

                if (depth == this.initialDepth) {
                    object.getMetadata().add(NIE.rootElementOf, this.source.getID());
                }

                // The object's id can differ from the requested URL (e.g. after a redirect).
                String finalUrl = object.getID().toString();
                if (!finalUrl.equals(url)) {
                    CrawlJob redundant = this.jobsMap.remove(finalUrl);
                    if (redundant != null) {
                        this.jobsQueue.remove(redundant);
                    }
                    if (isCrawled(finalUrl)) {
                        // Already handled in this crawl: release the object and do not
                        // process or report it a second time.
                        object.dispose();
                        continue;
                    }
                    addCrawled(finalUrl);
                }

                if (!hasAcceptableByteSize(object)) {
                    unregisterUrl(url, isKnownId);
                    // The object is not handed to any handler, so release it here.
                    object.dispose();
                    continue;
                }

                if (object instanceof FileDataObject) {
                    processLinks((FileDataObject) object, depth - 1);
                }
                if (isKnownId) {
                    reportModifiedDataObject(object);
                } else {
                    reportNewDataObject(object);
                }
            } catch (UrlNotFoundException e) {
                unregisterUrl(url, isKnownId);
            } catch (IOException e) {
                this.logger.info("I/O error while accessing " + url, e);
            } catch (Exception e) {
                // Deliberately broad: a single failing URL must not abort the crawl.
                this.logger.info("Error while accessing " + url, e);
            }
        }
    }

    /**
     * Checks the object's NIE.byteSize metadata against the configured maximum.
     * Objects without a byte size, or any object when no maximum is configured,
     * are accepted.
     */
    private boolean hasAcceptableByteSize(DataObject dataObject) {
        if (this.maxByteSize == Long.MAX_VALUE) {
            return true;
        }
        Long byteSize = dataObject.getMetadata().getLong(NIE.byteSize);
        if (byteSize == null) {
            return true;
        }
        return byteSize.longValue() <= this.maxByteSize;
    }

    /**
     * Handles a URL that is no longer available or acceptable.
     *
     * @param str the URL in question
     * @param z   true when the URL was known before this crawl, in which case it
     *            is reported as deleted; otherwise it is removed from the access
     *            data, if any
     */
    private void unregisterUrl(String str, boolean z) {
        if (z) {
            reportDeletedDataObject(str);
            return;
        }
        if (this.accessData != null) {
            this.accessData.remove(str);
        }
    }

    /**
     * Looks up a DataAccessor for the scheme of the given URL.
     *
     * @return an accessor from the first registered factory, or null when the
     *         URL has no scheme or no factory is registered for it
     */
    private DataAccessor getDataAccessor(String str) {
        int colon = str.indexOf(':');
        if (colon <= 0) {
            return null;
        }
        String scheme = str.substring(0, colon);
        Set<?> factories = this.accessorRegistry.get(scheme);
        if (factories.isEmpty()) {
            return null;
        }
        DataAccessorFactory factory = (DataAccessorFactory) factories.iterator().next();
        return factory.get();
    }

    /**
     * Schedules the links that the access data remembers for an unmodified URL.
     * When a redirect target is stored for the URL, the links of that target are
     * used instead.
     *
     * @param str the unmodified URL
     * @param i   the depth value to schedule the remembered links with
     */
    private void scheduleCachedLinks(String str, int i) {
        if (this.accessData == null) {
            this.logger.error("Internal error: scheduling cached links for unmodified url while no AccessData is set: " + str);
            return;
        }
        String redirectTarget = this.accessData.get(str, AccessData.REDIRECTS_TO_KEY);
        String lookupId = redirectTarget != null ? redirectTarget : str;
        Set<?> referredIds = this.accessData.getReferredIDs(lookupId);
        if (referredIds == null) {
            return;
        }
        for (Object referredId : referredIds) {
            schedule((String) referredId, i, true);
        }
    }

    /**
     * Extracts the links from the object's content, schedules them and records
     * them both in the object's metadata and in the access data.
     *
     * @param fileDataObject the fetched object whose content is inspected
     * @param i              the depth value for the extracted links; when it is
     *                       negative the links are only remembered in the access
     *                       data and neither scheduled nor added to the metadata
     */
    private void processLinks(FileDataObject fileDataObject, int i) {
        String obj = fileDataObject.getID().toString();
        // Links remembered from an earlier crawl are replaced by what is found now.
        if (this.accessData != null) {
            this.accessData.removeReferredIDs(obj);
        }

        InputStream content = getMarkSupportingContent(fileDataObject);
        if (content == null) {
            return;
        }
        String mimeType = getMimeType(content, fileDataObject);
        if (mimeType == null) {
            return;
        }
        LinkExtractor linkExtractor = getLinkExtractor(mimeType);
        if (linkExtractor == null) {
            return;
        }
        InputStream bufferedContent = getByteArrayContent(content, fileDataObject);
        if (bufferedContent == null) {
            return;
        }
        List<String> links = getLinks(bufferedContent, linkExtractor, obj);
        if (links == null) {
            return;
        }

        Model model = fileDataObject.getMetadata().getModel();
        // Links already added to the metadata, used to suppress duplicates.
        HashSet<String> recorded = new HashSet<>(links.size());
        for (String link : links) {
            if (link == null) {
                continue;
            }
            StringUriPair pair = normalizeAndFixURL(link, model);
            String target = pair.string;
            URI targetUri = pair.uri;
            if (target == null) {
                // Normalization failed; this message used to sit in an unreachable branch.
                this.logger.warn("WebCrawler is skipping link {}", link);
                continue;
            }
            if (obj.equals(target) || recorded.contains(target)) {
                continue;
            }
            if (i >= 0) {
                schedule(target, i, true);
                if (targetUri != null) {
                    fileDataObject.getMetadata().add(NIE.links, targetUri);
                    model.addStatement(targetUri, RDF.type, NIE.DataObject);
                    recorded.add(target);
                }
            }
            if (this.accessData != null) {
                this.accessData.putReferredID(obj, target);
            }
        }
    }

    /**
     * Runs the link extractor over the stream and rewinds the stream afterwards.
     *
     * @param inputStream   the content to scan; it is marked before and reset
     *                      after the extraction
     * @param linkExtractor the extractor to apply
     * @param str           the URL of the document, passed to the extractor as base URL
     * @return the extracted links, or null when the extraction failed
     */
    private List<String> getLinks(InputStream inputStream, LinkExtractor linkExtractor, String str) {
        inputStream.mark(Integer.MAX_VALUE);
        try {
            HashMap<String, Object> params = new HashMap<>();
            params.put(LinkExtractor.BASE_URL_KEY, str);
            if (this.includeEmbeddedResources != null) {
                params.put(LinkExtractor.INCLUDE_EMBEDDED_RESOURCES_KEY, this.includeEmbeddedResources);
            }
            return linkExtractor.extractLinks(inputStream, params);
        } catch (Exception e) {
            // Best effort: a failing extractor must not prevent the object from being reported.
            this.logger.info("Error while extracting links from " + str, e);
            return null;
        } finally {
            // Rewind exactly once on every path so later consumers see the content from the start.
            try {
                inputStream.reset();
            } catch (IOException e) {
                this.logger.warn("internal error: IOException while resetting a ByteArrayInputStream", e);
            }
        }
    }

    /**
     * Returns the object's content as a stream that supports mark/reset,
     * wrapping it in a BufferedInputStream when necessary.
     *
     * @return the stream, or null when the object has no content or the content
     *         could not be obtained (in the latter case the content is cleared)
     */
    private InputStream getMarkSupportingContent(FileDataObject fileDataObject) {
        try {
            InputStream content = fileDataObject.getContent();
            if (content == null) {
                // Nothing to inspect; avoids a NullPointerException that would abort handling of the object.
                return null;
            }
            // NOTE(review): the wrapper is not stored back on the data object here — confirm that
            // getContent() normally returns a mark-supporting stream so this wrapping is only a fallback.
            return content.markSupported() ? content : new BufferedInputStream(content);
        } catch (IOException e) {
            this.logger.info("IOException while obtaining the object content", e);
            fileDataObject.setContent(null);
            return null;
        }
    }

    /**
     * Makes sure the content is fully held in memory. A stream that already is
     * a ByteArrayInputStream is returned unchanged; any other stream is read
     * completely and replaced on the data object by an in-memory copy.
     *
     * @return the in-memory stream, or null when reading failed (the object's
     *         content is cleared in that case)
     */
    private InputStream getByteArrayContent(InputStream inputStream, FileDataObject fileDataObject) {
        if (inputStream instanceof ByteArrayInputStream) {
            return inputStream;
        }
        InputStream result;
        try {
            ByteArrayInputStream inMemory = new ByteArrayInputStream(IOUtil.readBytes(inputStream));
            fileDataObject.setContent(inMemory);
            result = inMemory;
        } catch (IOException e) {
            this.logger.warn("IOException while buffering document", e);
            fileDataObject.setContent(null);
            result = null;
        }
        return result;
    }

    /**
     * Determines the MIME type of the content by feeding its leading bytes to
     * the MIME type identifier; the stream is rewound afterwards. A detected
     * type is stored in the object's metadata, otherwise the type already
     * present in the metadata is returned.
     *
     * @return the MIME type, or null when none could be determined
     */
    private String getMimeType(InputStream inputStream, FileDataObject fileDataObject) {
        String detected = null;
        try {
            int probeLength = this.mimeTypeIdentifier.getMinArrayLength();
            inputStream.mark(probeLength);
            try {
                detected = this.mimeTypeIdentifier.identify(IOUtil.readBytes(inputStream, probeLength), null, fileDataObject.getID());
            } finally {
                inputStream.reset();
            }
        } catch (IOException e) {
            this.logger.debug("IOError while determining the mime type", e);
            try {
                inputStream.close();
            } catch (Exception ignored) {
                // Closing is best effort; the content is dropped below anyway.
            }
            fileDataObject.setContent(null);
        }

        if (detected != null) {
            fileDataObject.getMetadata().put(NIE.mimeType, detected);
            return detected;
        }
        return fileDataObject.getMetadata().getString(NIE.mimeType);
    }

    /**
     * Looks up a link extractor for the given MIME type.
     *
     * @return an extractor from the first registered factory, or null when no
     *         factory is registered for the MIME type
     */
    private LinkExtractor getLinkExtractor(String str) {
        Set<?> factories = this.linkExtractorRegistry.get(str);
        if (factories.isEmpty()) {
            return null;
        }
        LinkExtractorFactory factory = (LinkExtractorFactory) factories.iterator().next();
        return factory.get();
    }

    /**
     * Normalizes a URL and creates the matching URI. file:, http: and https:
     * URLs are normalized through UrlUtil; when the result is rejected as a URI,
     * one repair attempt is made by rebuilding it through java.net.URI.
     *
     * @param str   the raw URL; null yields an empty pair
     * @param model the model used to create the URI, or null to create a
     *              stand-alone URIImpl
     * @return the normalized string and its URI, or a pair of nulls when the
     *         URL is unusable
     */
    private StringUriPair normalizeAndFixURL(String str, Model model) {
        if (str == null) {
            return new StringUriPair(null, null);
        }

        String normalized = str;
        if (hasNormalizableScheme(str)) {
            try {
                normalized = UrlUtil.normalizeURL(new URL(str)).toExternalForm();
            } catch (MalformedURLException e) {
                return new StringUriPair(null, null);
            }
        }

        try {
            return new StringUriPair(normalized, createUri(normalized, model));
        } catch (IllegalArgumentException e) {
            // Rejected as a URI; attempt the repair below.
        }

        if (!hasNormalizableScheme(normalized)) {
            return new StringUriPair(null, null);
        }
        try {
            URL url = new URL(normalized);
            // Rebuilding from the separate components makes java.net.URI quote illegal characters.
            String repaired = new java.net.URI(url.getProtocol(), url.getAuthority(), url.getPath(), url.getQuery(), url.getRef()).toString();
            // createUri copes with a null model, which the previous retry code did not.
            return new StringUriPair(repaired, createUri(repaired, model));
        } catch (MalformedURLException | URISyntaxException e) {
            return new StringUriPair(null, null);
        } catch (ModelRuntimeException e) {
            this.logger.debug("Unable to create URI for link {}", normalized);
            return new StringUriPair(null, null);
        } catch (IllegalArgumentException e) {
            // Still rejected after the repair attempt.
            this.logger.debug("Unable to create URI for link {}", normalized);
            return new StringUriPair(null, null);
        }
    }

    /** Tells whether the URL uses one of the schemes that get normalized (file, http, https). */
    private static boolean hasNormalizableScheme(String url) {
        return url.startsWith("file:") || url.startsWith("http:") || url.startsWith("https:");
    }

    /** Creates a URI through the model when one is given, and as a stand-alone URIImpl otherwise. */
    private static URI createUri(String url, Model model) {
        return model != null ? model.createURI(url) : new URIImpl(url);
    }

    /**
     * Clears redirect records of ids that were not touched during this crawl:
     * each such id is touched and its REDIRECTS_TO_KEY entry is removed.
     * Does nothing when no access data is set.
     */
    private void removeDeprecatedRedirections() {
        if (this.accessData == null) {
            return;
        }

        // Collect first and modify afterwards, so the access data is not
        // changed while its iterator is still in use.
        HashSet<String> staleRedirects = new HashSet<>();
        ClosableIterator untouchedIds = this.accessData.getUntouchedIDsIterator();
        try {
            while (untouchedIds.hasNext()) {
                String id = untouchedIds.next().toString();
                if (this.accessData.get(id, AccessData.REDIRECTS_TO_KEY) != null) {
                    staleRedirects.add(id);
                }
            }
        } finally {
            // Release whatever the iterator holds, even if the scan fails.
            untouchedIds.close();
        }

        for (String id : staleRedirects) {
            this.accessData.touch(id);
            this.accessData.remove(id, AccessData.REDIRECTS_TO_KEY);
        }
    }

    /**
     * Drops all per-crawl state so that it can be garbage collected and cannot
     * leak into a later crawl.
     */
    private void cleanUp() {
        this.domainBoundaries = null;
        this.jobsQueue = null;
        this.jobsMap = null;
        this.crawledUrls = null;
        this.includeEmbeddedResources = null;
        // The wrapper is per-crawl state too; a stale one would otherwise be
        // consulted by isCrawled()/addCrawled() in a later crawl without access data.
        this.wad = null;
    }
}
