/*
 * Decompiled with CFR 0.152.
 */
package org.apache.stanbol.enhancer.engines.htmlextractor;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.BundleURIResolver;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ClerezzaRDFUtils;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractionRegistry;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlParser;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.InitializationException;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.BundleContext;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HtmlExtractorEngine
extends AbstractEnhancementEngine<IOException, RuntimeException>
implements EnhancementEngine,
ServiceProperties {
    private static final Logger LOG = LoggerFactory.getLogger(HtmlExtractorEngine.class);
    private static final Charset UTF8 = Charset.forName("UTF-8");
    public static final Integer defaultOrder = ORDERING_PRE_PROCESSING;
    private static final String DEFAULT_HTML_EXTRACTOR_REGISTRY = "htmlextractors.xml";
    public static final String HTML_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.htmlextractor.htmlextractors";
    private ContentItemFactory ciFactory;
    BundleContext bundleContext;
    private Set<String> supportedMimeTypes = new HashSet<String>(Arrays.asList("text/html", "application/xhtml+xml"));
    private HtmlExtractionRegistry htmlExtractorRegistry;
    private HtmlParser htmlParser;
    private boolean singleRootRdf = true;
    private static final String NIE_NS = "http://www.semanticdesktop.org/ontologies/2007/01/19/nie#";

    protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
        super.activate(ce);
        this.bundleContext = ce.getBundleContext();
        BundleURIResolver.BUNDLE = this.bundleContext.getBundle();
        String htmlExtractors = DEFAULT_HTML_EXTRACTOR_REGISTRY;
        Dictionary properties = ce.getProperties();
        String confFile = (String)properties.get(HTML_EXTRACTOR_REGISTRY);
        if (confFile != null && confFile.trim().length() > 0) {
            htmlExtractors = confFile;
        }
        try {
            this.htmlExtractorRegistry = new HtmlExtractionRegistry(htmlExtractors);
        }
        catch (InitializationException e) {
            LOG.error("Registry Initialization Error: " + e.getMessage());
            throw new IOException(e.getMessage());
        }
        this.htmlParser = new HtmlParser();
    }

    protected void deactivate(ComponentContext ce) {
        super.deactivate(ce);
        this.htmlParser = null;
        this.htmlExtractorRegistry = null;
    }

    public Map<String, Object> getServiceProperties() {
        return Collections.unmodifiableMap(Collections.singletonMap("org.apache.stanbol.enhancer.engine.order", defaultOrder));
    }

    public int canEnhance(ContentItem ci) throws EngineException {
        LOG.info("MimeType: {}", (Object)ci.getMimeType());
        if (this.isSupported(ci.getMimeType())) {
            return 2;
        }
        return 0;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    public void computeEnhancements(ContentItem ci) throws EngineException {
        HtmlExtractor extractor = new HtmlExtractor(this.htmlExtractorRegistry, this.htmlParser);
        SimpleMGraph model = new SimpleMGraph();
        ci.getLock().readLock().lock();
        try {
            extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(), null, ci.getMimeType(), (MGraph)model);
        }
        catch (ExtractorException e) {
            throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with HtmlExtractor", (Throwable)e);
        }
        finally {
            ci.getLock().readLock().unlock();
        }
        ClerezzaRDFUtils.urifyBlankNodes((MGraph)model);
        if (this.singleRootRdf) {
            ClerezzaRDFUtils.makeConnected((MGraph)model, (NonLiteral)ci.getUri(), new UriRef("http://www.semanticdesktop.org/ontologies/2007/01/19/nie#contains"));
        }
        ci.getLock().writeLock().lock();
        try {
            LOG.info("Model: {}", (Object)model);
            ci.getMetadata().addAll((Collection)model);
            model = null;
        }
        finally {
            ci.getLock().writeLock().unlock();
        }
    }

    private boolean isSupported(String mimeType) {
        return this.supportedMimeTypes.contains(mimeType);
    }

    protected void bindCiFactory(ContentItemFactory contentItemFactory) {
        this.ciFactory = contentItemFactory;
    }

    protected void unbindCiFactory(ContentItemFactory contentItemFactory) {
        if (this.ciFactory == contentItemFactory) {
            this.ciFactory = null;
        }
    }
}

