/*
 * Copyright (c) 2008 Aduna and Deutsches Forschungszentrum fuer Kuenstliche Intelligenz DFKI GmbH.
 * All rights reserved.
 * 
 * Licensed under the Academic Free License version 3.0.
 */
package org.semanticdesktop.aperture.subcrawler;

import java.io.InputStream;
import java.nio.charset.Charset;

import org.ontoware.rdf2go.model.node.URI;
import org.semanticdesktop.aperture.accessor.AccessData;
import org.semanticdesktop.aperture.datasource.DataSource;
import org.semanticdesktop.aperture.rdf.RDFContainer;

/**
 * Contract for components that read an InputStream and emit a series of DataObjects describing the
 * resources found "inside" that stream.
 * 
 * <p>
 * Incremental crawling is supported on an opt-in basis: when the caller supplies an AccessData instance,
 * the SubCrawler can scan the stream and report how it differs from the previous crawl.
 */
public interface SubCrawler {

    /**
     * Crawls the supplied stream and reports every DataObject encountered to the supplied
     * SubCrawlerHandler. When an AccessData instance is provided, it is consulted to decide whether each
     * data object is reported as new, modified, or unmodified. Deleted objects are never reported by a
     * SubCrawler.
     * 
     * @param id the URI identifying the object (e.g. a file or web page) the stream was obtained from. It
     *            is treated as the URI of the parent object; every object found in the stream is
     *            considered to be contained within that parent. (optional, the implementation may use
     *            this URI or the one returned by the {@link RDFContainer#getDescribedUri()} method of the
     *            parentMetadata)
     * @param stream the stream to be crawled. (obligatory)
     * @param handler the handler that receives the notifications issued by the SubCrawler. (obligatory)
     * @param dataSource the DataSource handed to this sub-crawl. NOTE(review): the role of this parameter
     *            and whether it may be null are not specified here; presumably it is the data source
     *            the parent object belongs to - confirm against the implementations.
     * @param accessData the AccessData used to decide whether encountered objects are reported as new,
     *            modified or unmodified. Information about new or modified objects is stored in it for
     *            use in future crawls. May be null when this functionality is not desired, in which
     *            case all DataObjects are reported as new. (optional)
     * @param charset the charset in which the input stream is encoded. (optional)
     * @param mimeType the MIME type of the passed stream. (optional)
     * @param parentMetadata the 'parent' RDFContainer that will hold the metadata about the top-level
     *            entity in the stream. In some cases a SubCrawler may do nothing more than augment the
     *            metadata in this RDFContainer, without delivering any additional DataObjects.
     *            (obligatory)
     * @throws SubCrawlerException if any of the obligatory parameters is null or if any error occurs
     *             during the crawling process
     */
    void subCrawl(
            URI id,
            InputStream stream,
            SubCrawlerHandler handler,
            DataSource dataSource,
            AccessData accessData,
            Charset charset,
            String mimeType,
            RDFContainer parentMetadata) throws SubCrawlerException;

    /**
     * Asks a running crawl to stop as quickly as possible. The call is allowed to return before the
     * crawl has actually come to a halt.
     */
    void stopSubCrawler();
}
