/*
 * Decompiled with CFR 0.152.
 */
package org.semanticdesktop.aperture.crawler.web;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.ontoware.aifbcommons.collection.ClosableIterator;
import org.ontoware.rdf2go.exception.ModelRuntimeException;
import org.ontoware.rdf2go.model.Model;
import org.ontoware.rdf2go.model.node.Node;
import org.ontoware.rdf2go.model.node.Resource;
import org.ontoware.rdf2go.model.node.URI;
import org.ontoware.rdf2go.model.node.impl.URIImpl;
import org.ontoware.rdf2go.vocabulary.RDF;
import org.semanticdesktop.aperture.accessor.AccessData;
import org.semanticdesktop.aperture.accessor.DataAccessor;
import org.semanticdesktop.aperture.accessor.DataAccessorFactory;
import org.semanticdesktop.aperture.accessor.DataObject;
import org.semanticdesktop.aperture.accessor.FileDataObject;
import org.semanticdesktop.aperture.accessor.RDFContainerFactory;
import org.semanticdesktop.aperture.accessor.UrlNotFoundException;
import org.semanticdesktop.aperture.accessor.base.FilterAccessData;
import org.semanticdesktop.aperture.crawler.ExitCode;
import org.semanticdesktop.aperture.crawler.base.CrawlerBase;
import org.semanticdesktop.aperture.crawler.web.CrawlJob;
import org.semanticdesktop.aperture.datasource.config.DomainBoundaries;
import org.semanticdesktop.aperture.datasource.web.WebDataSource;
import org.semanticdesktop.aperture.hypertext.linkextractor.LinkExtractor;
import org.semanticdesktop.aperture.hypertext.linkextractor.LinkExtractorFactory;
import org.semanticdesktop.aperture.hypertext.linkextractor.LinkExtractorRegistry;
import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier;
import org.semanticdesktop.aperture.util.IOUtil;
import org.semanticdesktop.aperture.util.UrlUtil;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class WebCrawler
extends CrawlerBase {
    /** Logger named after the runtime class of this crawler instance. */
    private Logger logger = LoggerFactory.getLogger(this.getClass());
    /** Identifies the MIME type of fetched content; initialize() refuses to run without it. */
    private MimeTypeIdentifier mimeTypeIdentifier;
    /** Supplies link extractors per MIME type; initialize() refuses to run without it. */
    private LinkExtractorRegistry linkExtractorRegistry;
    /** Largest accepted NIE.byteSize value; Long.MAX_VALUE disables the size check. */
    private long maxByteSize;
    /** Passed to the link extractor when non-null; taken from the data source in initialize(). */
    private Boolean includeEmbeddedResources;
    /** Domain filter consulted by schedule() when domain checking is requested. */
    private DomainBoundaries domainBoundaries;
    /** Pending crawl jobs; schedule() inserts each job behind the last queued job of equal or greater depth. */
    private LinkedList<CrawlJob> jobsQueue;
    /** Pending crawl jobs keyed by their (normalized) URL. */
    private HashMap<String, CrawlJob> jobsMap;
    /** URLs already handled in this crawl; only used when no AccessData is available. */
    private HashSet<String> crawledUrls;
    /** Depth assigned to the root URL (Integer.MAX_VALUE when the data source sets no maximum depth). */
    private int initialDepth;
    /** Wrapper around the crawler's AccessData, created in initialize() when AccessData is present. */
    private WebAccessData wad = null;

    /**
     * Sets the MimeTypeIdentifier used to determine the MIME type of fetched content.
     * A crawl cannot start while this is {@code null}.
     *
     * @param mimeTypeIdentifier the identifier to use
     */
    public void setMimeTypeIdentifier(MimeTypeIdentifier mimeTypeIdentifier) {
        this.mimeTypeIdentifier = mimeTypeIdentifier;
    }

    /**
     * Returns the MimeTypeIdentifier currently configured on this crawler.
     *
     * @return the configured identifier, or {@code null} if none has been set
     */
    public MimeTypeIdentifier getMimeTypeIdentifier() {
        return this.mimeTypeIdentifier;
    }

    /**
     * Sets the registry from which link extractors are looked up by MIME type.
     * A crawl cannot start while this is {@code null}.
     *
     * @param linkExtractorRegistry the registry to use
     */
    public void setLinkExtractorRegistry(LinkExtractorRegistry linkExtractorRegistry) {
        this.linkExtractorRegistry = linkExtractorRegistry;
    }

    /**
     * Returns the LinkExtractorRegistry currently configured on this crawler.
     *
     * @return the configured registry, or {@code null} if none has been set
     */
    public LinkExtractorRegistry getLinkExtractorRegistry() {
        return this.linkExtractorRegistry;
    }

    /**
     * Runs one complete crawl: sets up the crawl state, works through the job
     * queue, handles outdated redirections and finally releases the crawl state.
     *
     * @return {@link ExitCode#COMPLETED} when the job queue was fully drained,
     *         {@link ExitCode#STOP_REQUESTED} when jobs were left unprocessed
     */
    @Override
    protected ExitCode crawlObjects() {
        this.initialize();
        this.processQueue();
        this.removeDeprecatedRedirections();

        // The outcome must be determined before cleanUp() discards the queue.
        ExitCode outcome;
        if (this.jobsQueue.isEmpty()) {
            outcome = ExitCode.COMPLETED;
        } else {
            outcome = ExitCode.STOP_REQUESTED;
        }
        this.cleanUp();
        return outcome;
    }

    /**
     * Prepares all per-crawl state and schedules the root URL of the data source.
     *
     * <p>Exactly one "already crawled" bookkeeping mechanism is active per crawl:
     * the AccessData wrapper when AccessData is available, the in-memory URL set
     * otherwise. The inactive one is explicitly reset so that state left over
     * from a previous crawl of this instance can never be consulted.
     *
     * @throws IllegalArgumentException if no MimeTypeIdentifier or no
     *         LinkExtractorRegistry has been set
     */
    private void initialize() {
        if (this.mimeTypeIdentifier == null) {
            throw new IllegalArgumentException("MimeTypeIdentifier missing");
        }
        if (this.linkExtractorRegistry == null) {
            throw new IllegalArgumentException("LinkExtractorRegistry missing");
        }

        this.jobsQueue = new LinkedList<CrawlJob>();
        this.jobsMap = new HashMap<String, CrawlJob>(1024);
        if (this.accessData == null) {
            // isCrawled()/addCrawled() prefer the wrapper whenever it is non-null,
            // so a wrapper from an earlier crawl must not survive.
            this.wad = null;
            this.crawledUrls = new HashSet<String>(1024);
        } else {
            this.crawledUrls = null;
            this.wad = new WebAccessData(this.accessData);
        }

        WebDataSource webDataSource = (WebDataSource) this.getDataSource();
        String rootUrl = webDataSource.getRootUrl();
        this.domainBoundaries = webDataSource.getDomainBoundaries();
        this.includeEmbeddedResources = webDataSource.getIncludeEmbeddedResources();

        // A missing maximum depth / size means "unlimited".
        Integer maximumDepth = webDataSource.getMaximumDepth();
        this.initialDepth = maximumDepth == null ? Integer.MAX_VALUE : maximumDepth;
        Long maximumSize = webDataSource.getMaximumSize();
        this.maxByteSize = maximumSize == null ? Long.MAX_VALUE : maximumSize;

        // The root URL is scheduled without a domain-boundary check.
        this.schedule(rootUrl, this.initialDepth, false);
    }

    /**
     * Queues a URL for crawling, unless it is invalid, already crawled or
     * (optionally) outside the domain boundaries.
     *
     * <p>If a job for the URL is already pending, it is only re-queued when the
     * new depth is larger than the depth it was queued with.
     *
     * @param string the URL to schedule; {@code null} is ignored
     * @param n the crawl depth to associate with the URL
     * @param bl whether the URL must pass the domain-boundary check
     */
    private void schedule(String string, int n, boolean bl) {
        if (string == null) {
            return;
        }
        String url = this.normalizeAndFixURL(string, null).string;
        if (url == null || this.isCrawled(url)) {
            return;
        }
        if (bl && !this.domainBoundaries.inDomain(url)) {
            return;
        }

        CrawlJob job = this.jobsMap.get(url);
        if (job != null) {
            if (job.getDepth() >= n) {
                // The pending job already covers at least this depth.
                return;
            }
            job.setDepth(n);
            this.jobsQueue.remove(job);
        } else {
            job = new CrawlJob(url, n);
            this.jobsMap.put(url, job);
        }

        // Walk backwards from the tail and insert right behind the last job whose
        // depth is not smaller than n; insert at the head when no such job exists.
        ListIterator<CrawlJob> cursor = this.jobsQueue.listIterator(this.jobsQueue.size());
        boolean positioned = false;
        while (!positioned && cursor.hasPrevious()) {
            if (cursor.previous().getDepth() >= n) {
                cursor.next();
                positioned = true;
            }
        }
        cursor.add(job);
    }

    /**
     * Tells whether a URL has already been handled during this crawl.
     *
     * @param string the URL to look up
     * @return the AccessData wrapper's touched state when a wrapper exists,
     *         otherwise membership in the in-memory set of crawled URLs
     */
    private boolean isCrawled(String string) {
        return this.wad == null
                ? this.crawledUrls.contains(string)
                : this.wad.isTouched(string);
    }

    /**
     * Records a URL as handled during this crawl, either by touching it in the
     * AccessData wrapper or by adding it to the in-memory set of crawled URLs.
     *
     * @param string the URL to record
     */
    private void addCrawled(String string) {
        if (this.wad == null) {
            this.crawledUrls.add(string);
            return;
        }
        this.wad.touch(string);
    }

    /**
     * Works through the job queue until it is empty or a stop is requested.
     *
     * <p>For every job the URL is fetched through the DataAccessor registered
     * for its scheme. Unmodified objects only have their cached links
     * re-scheduled. Modified or new objects have their links extracted and are
     * reported, provided they do not exceed the size limit. Objects that are
     * fetched but not reported (duplicates reached via another URL, or objects
     * exceeding the size limit) are disposed here, because no handler will ever
     * receive them.
     */
    private void processQueue() {
        while (!this.jobsQueue.isEmpty() && !this.isStopRequested()) {
            CrawlJob job = this.jobsQueue.removeFirst();
            String url = job.getURL();
            int depth = job.getDepth();

            if (this.logger.isDebugEnabled()) {
                // Parsing is done purely to surface syntactically faulty URLs in the debug log.
                try {
                    new java.net.URI(url);
                }
                catch (URISyntaxException uRISyntaxException) {
                    this.logger.debug("Faulty url: " + url);
                }
            }

            this.reportAccessingObject(url);
            // Must be determined before the URL is touched/registered below.
            boolean knownUrl = this.accessData != null && this.accessData.isKnownId(url);
            this.addCrawled(url);
            this.jobsMap.remove(url);

            DataAccessor dataAccessor = this.getDataAccessor(url);
            if (dataAccessor == null) {
                continue;
            }

            try {
                RDFContainerFactory containerFactory = this.getRDFContainerFactory(url);
                DataObject dataObject = dataAccessor.getDataObjectIfModified(url, this.source, this.wad, null, containerFactory);
                if (dataObject == null) {
                    this.reportUnmodifiedDataObject(url);
                    if (depth > 0) {
                        this.scheduleCachedLinks(url, depth - 1);
                    }
                    continue;
                }

                if (depth == this.initialDepth) {
                    dataObject.getMetadata().add(NIE.rootElementOf, (Node)this.source.getID());
                }

                // The object's ID may differ from the requested URL (e.g. after a redirect).
                String finalUrl = dataObject.getID().toString();
                if (!finalUrl.equals(url)) {
                    CrawlJob duplicateJob = this.jobsMap.remove(finalUrl);
                    if (duplicateJob != null) {
                        this.jobsQueue.remove(duplicateJob);
                    }
                    if (this.isCrawled(finalUrl)) {
                        dataObject.dispose();
                        continue;
                    }
                    this.addCrawled(finalUrl);
                }

                if (this.hasAcceptableByteSize(dataObject)) {
                    if (dataObject instanceof FileDataObject) {
                        this.processLinks((FileDataObject)dataObject, depth - 1);
                    }
                    if (knownUrl) {
                        this.reportModifiedDataObject(dataObject);
                    } else {
                        this.reportNewDataObject(dataObject);
                    }
                } else {
                    // Too large: the object is not part of the crawl result. Nobody else
                    // receives it, so release its resources here.
                    try {
                        this.unregisterUrl(url, knownUrl);
                    }
                    finally {
                        dataObject.dispose();
                    }
                }
            }
            catch (UrlNotFoundException urlNotFoundException) {
                this.unregisterUrl(url, knownUrl);
            }
            catch (IOException iOException) {
                this.logger.info("I/O error while accessing " + url, (Throwable)iOException);
            }
            catch (Exception exception) {
                this.logger.info("Error while accessing " + url, (Throwable)exception);
            }
        }
    }

    /**
     * Checks a data object against the configured maximum byte size.
     *
     * @param dataObject the object whose NIE.byteSize metadata is inspected
     * @return {@code true} when no limit is configured, when the object carries
     *         no byte size, or when its byte size does not exceed the limit
     */
    private boolean hasAcceptableByteSize(DataObject dataObject) {
        if (this.maxByteSize != Long.MAX_VALUE) {
            Long byteSize = dataObject.getMetadata().getLong(NIE.byteSize);
            return byteSize == null || byteSize.longValue() <= this.maxByteSize;
        }
        // No limit configured: the metadata is not even consulted.
        return true;
    }

    /**
     * Drops a URL that turned out not to be part of the crawl result.
     *
     * @param string the URL to unregister
     * @param bl whether the URL was already known before this crawl; known URLs
     *        are reported as deleted, unknown ones are merely removed from the
     *        AccessData (if any)
     */
    private void unregisterUrl(String string, boolean bl) {
        if (bl) {
            this.reportDeletedDataObject(string);
            return;
        }
        if (this.accessData != null) {
            this.accessData.remove(string);
        }
    }

    /**
     * Looks up a DataAccessor for the scheme of the given URL.
     *
     * @param string the URL whose scheme (the text before the first ':') selects the accessor
     * @return an accessor obtained from the first registered factory for that
     *         scheme, or {@code null} when the URL has no scheme or no factory
     *         is registered for it
     */
    private DataAccessor getDataAccessor(String string) {
        int colon = string.indexOf(':');
        if (colon < 1) {
            return null;
        }
        Set<?> factories = this.accessorRegistry.get(string.substring(0, colon));
        if (factories.isEmpty()) {
            return null;
        }
        return ((DataAccessorFactory) factories.iterator().next()).get();
    }

    /**
     * Schedules the links stored in the AccessData for an unmodified URL.
     * When the AccessData records a "redirectsTo" value for the URL, the links
     * stored for that redirect target are used instead.
     *
     * @param string the unmodified URL
     * @param n the depth with which the stored links are scheduled
     */
    private void scheduleCachedLinks(String string, int n) {
        if (this.accessData == null) {
            this.logger.error("Internal error: scheduling cached links for unmodified url while no AccessData is set: " + string);
            return;
        }

        String redirectTarget = this.accessData.get(string, "redirectsTo");
        String linkOwner = redirectTarget != null ? redirectTarget : string;

        Set<String> referredIds = this.accessData.getReferredIDs(linkOwner);
        if (referredIds == null) {
            return;
        }
        for (String referredId : referredIds) {
            this.schedule(referredId, n, true);
        }
    }

    /**
     * Extracts the links from a fetched object, schedules them and records them
     * in the object's metadata and in the AccessData.
     *
     * <p>Processing stops silently when the content cannot be obtained, when no
     * MIME type can be determined, when no link extractor is registered for the
     * MIME type, or when link extraction fails.
     *
     * @param fileDataObject the object whose content is scanned for links
     * @param n the depth for the extracted links; when negative the links are
     *        neither scheduled nor added to the metadata, but are still stored
     *        as referred IDs in the AccessData (if any)
     */
    private void processLinks(FileDataObject fileDataObject, int n) {
        String ownUrl = fileDataObject.getID().toString();
        // Previously stored links of this object are replaced by the fresh ones.
        if (this.accessData != null) {
            this.accessData.removeReferredIDs(ownUrl);
        }

        InputStream content = this.getMarkSupportingContent(fileDataObject);
        if (content == null) {
            return;
        }
        String mimeType = this.getMimeType(content, fileDataObject);
        if (mimeType == null) {
            return;
        }
        LinkExtractor linkExtractor = this.getLinkExtractor(mimeType);
        if (linkExtractor == null) {
            return;
        }
        content = this.getByteArrayContent(content, fileDataObject);
        if (content == null) {
            return;
        }
        List<String> links = this.getLinks(content, linkExtractor, ownUrl);
        if (links == null) {
            return;
        }

        Set<String> recordedLinks = new HashSet<String>(links.size());
        for (String rawLink : links) {
            StringUriPair normalized = this.normalizeAndFixURL(rawLink, fileDataObject.getMetadata().getModel());
            String link = normalized.string;
            URI linkUri = normalized.uri;
            // Skip unusable links, self-references and links already recorded for this object.
            if (link == null || ownUrl.equals(link) || recordedLinks.contains(link)) {
                continue;
            }
            if (n >= 0) {
                this.schedule(link, n, true);
                if (linkUri != null) {
                    fileDataObject.getMetadata().add(NIE.links, (Node)linkUri);
                    fileDataObject.getMetadata().getModel().addStatement((Resource)linkUri, RDF.type, (Node)NIE.DataObject);
                    recordedLinks.add(link);
                }
            }
            if (this.accessData != null) {
                this.accessData.putReferredID(ownUrl, link);
            }
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Runs a link extractor over the given stream and rewinds the stream afterwards.
     *
     * @param inputStream the content to scan; it is marked before and reset after extraction
     * @param linkExtractor the extractor to apply
     * @param string the URL of the document, handed to the extractor as base URL
     * @return the extracted links, or {@code null} when extraction failed
     */
    private List<String> getLinks(InputStream inputStream, LinkExtractor linkExtractor, String string) {
        List<String> links = null;
        try {
            inputStream.mark(Integer.MAX_VALUE);

            HashMap<Object, Object> params = new HashMap<Object, Object>();
            params.put(LinkExtractor.BASE_URL_KEY, string);
            if (this.includeEmbeddedResources != null) {
                params.put(LinkExtractor.INCLUDE_EMBEDDED_RESOURCES_KEY, this.includeEmbeddedResources);
            }
            links = linkExtractor.extractLinks(inputStream, params);
        }
        catch (Exception exception) {
            this.logger.info("IOException while extracting links", (Throwable)exception);
        }
        finally {
            // Rewind regardless of the outcome so the content can be read again later.
            try {
                inputStream.reset();
            }
            catch (IOException iOException) {
                this.logger.warn("internal error: IOException while resetting a ByteArrayInputStream", (Throwable)iOException);
            }
        }
        return links;
    }

    /**
     * Obtains the content stream of an object in a form that supports mark/reset.
     *
     * @param fileDataObject the object whose content is requested
     * @return the content stream, wrapped in a BufferedInputStream when it does
     *         not support marking itself, or {@code null} when the content
     *         could not be obtained (the object's content is then cleared)
     */
    private InputStream getMarkSupportingContent(FileDataObject fileDataObject) {
        try {
            InputStream content = fileDataObject.getContent();
            return content.markSupported() ? content : new BufferedInputStream(content);
        }
        catch (IOException iOException) {
            this.logger.info("IOException while obtaining the object content", (Throwable)iOException);
            fileDataObject.setContent(null);
            return null;
        }
    }

    /**
     * Ensures the content is held by a ByteArrayInputStream.
     *
     * @param inputStream the current content stream
     * @param fileDataObject the object owning the content; it receives the new
     *        stream when buffering was necessary
     * @return {@code inputStream} itself when it already is a
     *         ByteArrayInputStream, otherwise a new ByteArrayInputStream over
     *         all its bytes, or {@code null} when reading failed (the object's
     *         content is then cleared)
     */
    private InputStream getByteArrayContent(InputStream inputStream, FileDataObject fileDataObject) {
        if (inputStream instanceof ByteArrayInputStream) {
            return inputStream;
        }

        InputStream buffered;
        try {
            buffered = new ByteArrayInputStream(IOUtil.readBytes(inputStream));
        }
        catch (IOException iOException) {
            this.logger.warn("IOException while buffering document", (Throwable)iOException);
            fileDataObject.setContent(null);
            return null;
        }
        fileDataObject.setContent(buffered);
        return buffered;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Determines the MIME type of an object from the leading bytes of its content.
     *
     * <p>A successfully identified type is stored in the object's metadata under
     * NIE.mimeType. When identification yields nothing, the NIE.mimeType value
     * already present in the metadata is returned instead.
     *
     * @param inputStream the content stream; it is marked before and reset after reading
     * @param fileDataObject the object the stream belongs to
     * @return the MIME type, or {@code null} when none could be established
     */
    private String getMimeType(InputStream inputStream, FileDataObject fileDataObject) {
        String identified = null;
        try {
            int sampleLength = this.mimeTypeIdentifier.getMinArrayLength();
            inputStream.mark(sampleLength);
            try {
                byte[] sample = IOUtil.readBytes(inputStream, sampleLength);
                identified = this.mimeTypeIdentifier.identify(sample, null, fileDataObject.getID());
            }
            finally {
                inputStream.reset();
            }
        }
        catch (IOException iOException) {
            this.logger.debug("IOError while determining the mime type", (Throwable)iOException);
            try {
                inputStream.close();
            }
            catch (Exception exception) {
                // empty catch block
            }
            fileDataObject.setContent(null);
        }

        if (identified != null) {
            fileDataObject.getMetadata().put(NIE.mimeType, identified);
            return identified;
        }
        // Nothing identified: fall back to whatever the metadata already states.
        return fileDataObject.getMetadata().getString(NIE.mimeType);
    }

    /**
     * Looks up a link extractor for a MIME type.
     *
     * @param string the MIME type
     * @return an extractor obtained from the first registered factory for that
     *         MIME type, or {@code null} when none is registered
     */
    private LinkExtractor getLinkExtractor(String string) {
        Set<?> factories = this.linkExtractorRegistry.get(string);
        if (factories.isEmpty()) {
            return null;
        }
        return ((LinkExtractorFactory) factories.iterator().next()).get();
    }

    /**
     * Normalizes a URL string and converts it into an RDF URI.
     *
     * <p>URLs starting with "file:", "http:" or "https:" are first normalized via
     * UrlUtil. If the (normalized) string is rejected as an RDF URI, such URLs
     * get a second chance: they are rebuilt through the multi-argument
     * java.net.URI constructor and the conversion is retried.
     *
     * @param string the URL to normalize; {@code null} yields an empty pair
     * @param model the model used to create the RDF URI, or {@code null} to
     *        create a model-independent URIImpl
     * @return a pair of normalized string and RDF URI; both components are
     *         {@code null} when the URL is malformed or cannot be converted
     */
    private StringUriPair normalizeAndFixURL(String string, Model model) {
        if (string == null) {
            return new StringUriPair(null, null);
        }

        String url = string;
        if (string.startsWith("file:") || string.startsWith("http:") || string.startsWith("https:")) {
            try {
                url = UrlUtil.normalizeURL(new URL(string)).toExternalForm();
            }
            catch (MalformedURLException malformedURLException) {
                return new StringUriPair(null, null);
            }
        }

        try {
            return new StringUriPair(url, this.newRdfUri(url, model));
        }
        catch (IllegalArgumentException illegalArgumentException) {
            // Fall through to the repair attempt below.
        }

        if (!(url.startsWith("file:") || url.startsWith("http:") || url.startsWith("https:"))) {
            return new StringUriPair(null, null);
        }
        try {
            URL parsed = new URL(url);
            // Rebuilding the URI from its components makes java.net.URI quote illegal characters.
            java.net.URI repaired = new java.net.URI(parsed.getProtocol(), parsed.getAuthority(), parsed.getPath(), parsed.getQuery(), parsed.getRef());
            url = repaired.toString();
            // Must honour a null model as well: schedule() always passes null.
            return new StringUriPair(url, this.newRdfUri(url, model));
        }
        catch (MalformedURLException malformedURLException) {
            return new StringUriPair(null, null);
        }
        catch (URISyntaxException uRISyntaxException) {
            return new StringUriPair(null, null);
        }
        catch (ModelRuntimeException modelRuntimeException) {
            this.logger.debug("Unable to create URI for link {}", (Object)url);
            return new StringUriPair(null, null);
        }
        catch (IllegalArgumentException illegalArgumentException) {
            // Still not acceptable as an RDF URI after the repair attempt.
            this.logger.debug("Unable to create URI for link {}", (Object)url);
            return new StringUriPair(null, null);
        }
    }

    /** Creates an RDF URI through the model when one is given, otherwise as a standalone URIImpl. */
    private URI newRdfUri(String url, Model model) {
        if (model != null) {
            return model.createURI(url);
        }
        return new URIImpl(url);
    }

    /**
     * Handles IDs that were not touched during this crawl but still carry a
     * "redirectsTo" entry in the AccessData: each such ID is touched and its
     * "redirectsTo" entry is removed. Does nothing without AccessData.
     *
     * <p>The affected IDs are collected first and modified afterwards, so the
     * AccessData is never changed while its iterator is open. The iterator is
     * always closed, even when iteration fails.
     */
    private void removeDeprecatedRedirections() {
        if (this.accessData == null) {
            return;
        }

        Set<String> deprecatedIds = new HashSet<String>();
        ClosableIterator<?> untouchedIds = this.accessData.getUntouchedIDsIterator();
        try {
            while (untouchedIds.hasNext()) {
                String id = untouchedIds.next().toString();
                if (this.accessData.get(id, "redirectsTo") != null) {
                    deprecatedIds.add(id);
                }
            }
        }
        finally {
            untouchedIds.close();
        }

        for (String id : deprecatedIds) {
            this.accessData.touch(id);
            this.accessData.remove(id, "redirectsTo");
        }
    }

    /**
     * Releases all per-crawl state once a crawl has finished.
     *
     * <p>The AccessData wrapper is released as well: isCrawled() and
     * addCrawled() use it whenever it is non-null, so a wrapper surviving the
     * crawl would be consulted by a later crawl that runs without AccessData.
     */
    private void cleanUp() {
        this.domainBoundaries = null;
        this.jobsQueue = null;
        this.jobsMap = null;
        this.crawledUrls = null;
        this.includeEmbeddedResources = null;
        this.wad = null;
    }

    private class WebAccessData
    extends FilterAccessData {
        public WebAccessData(AccessData accessData) {
            super(accessData);
        }

        public void put(String string, String string2, String string3) {
            if ("redirectsTo".equals(string2)) {
                this.touch(string);
                CrawlJob crawlJob = (CrawlJob)WebCrawler.this.jobsMap.remove(string);
                if (crawlJob != null) {
                    WebCrawler.this.jobsQueue.remove(crawlJob);
                }
            }
            super.put(string, string2, string3);
        }
    }

    /**
     * Immutable holder for a normalized URL in both its string form and its
     * RDF URI form. Either component may be {@code null}.
     */
    private static class StringUriPair {
        /** The normalized URL as a string. */
        private final String string;
        /** The normalized URL as an RDF URI. */
        private final URI uri;

        public StringUriPair(String string, URI uri) {
            this.string = string;
            this.uri = uri;
        }
    }
}

