package org.apache.stanbol.enhancer.engines.htmlextractor.impl;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.abdera.util.Constants;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;

/* loaded from: input_file:WEB-INF/lib/org.apache.stanbol.enhancer.engines.htmlextractor-0.10.0.jar:org/apache/stanbol/enhancer/engines/htmlextractor/impl/HtmlExtractor.class */
/**
 * Runs the configured HTML extraction components over an HTML document and
 * collects the statements they produce in an {@link MGraph}.
 *
 * <p>The set of components and the subset that is active are supplied by an
 * {@link HtmlExtractionRegistry}; the document is turned into a DOM by an
 * {@link HtmlParser} before the components are invoked.</p>
 */
public class HtmlExtractor {
    private static final Logger LOG = LoggerFactory.getLogger(HtmlExtractor.class);

    /** Name of the registry configuration used by the no-argument constructor. */
    public static String DEFAULT_CONFIGURATION = "htmlextractors.xml";

    private HtmlParser htmlParser;

    /**
     * Registry of extraction components. Remains {@code null} when the
     * no-argument constructor fails to initialise it; {@link #extract} is a
     * no-op in that case.
     */
    public HtmlExtractionRegistry registry;

    /**
     * Creates an extractor backed by {@link #DEFAULT_CONFIGURATION}.
     *
     * <p>Initialisation failures are logged (with stack trace) rather than
     * thrown; the instance is then left without a registry and
     * {@link #extract} silently does nothing.</p>
     */
    public HtmlExtractor() {
        try {
            this.htmlParser = new HtmlParser();
            this.registry = new HtmlExtractionRegistry(DEFAULT_CONFIGURATION);
        } catch (InitializationException e) {
            // Pass the exception itself so the stack trace is not lost.
            LOG.error("Registry Initialization Error: " + e.getMessage(), e);
        }
    }

    /**
     * Creates an extractor from an already constructed registry and parser.
     *
     * @param htmlExtractionRegistry registry supplying the extraction components
     * @param htmlParser parser used to build the DOM handed to the components
     */
    public HtmlExtractor(HtmlExtractionRegistry htmlExtractionRegistry, HtmlParser htmlParser) {
        this.registry = htmlExtractionRegistry;
        this.htmlParser = htmlParser;
    }

    /**
     * Creates an extractor whose registry is built from the given configuration.
     *
     * @param str configuration passed to the {@link HtmlExtractionRegistry} constructor
     * @throws InitializationException if the parser or the registry cannot be created
     */
    public HtmlExtractor(String str) throws InitializationException {
        this.htmlParser = new HtmlParser();
        this.registry = new HtmlExtractionRegistry(str);
    }

    /**
     * Parses the stream into a DOM and lets every active extraction component
     * add its statements to {@code mGraph}.
     *
     * <p>Returns immediately without touching the stream when no registry is
     * available.</p>
     *
     * @param str identifier handed to each component as its first argument
     *            ({@link #main} passes the URI of the file)
     * @param inputStream HTML content; wrapped in a {@link BufferedInputStream}
     *            for charset detection when it does not support mark/reset
     * @param charset encoding of the content, or {@code null} to detect it
     *            (falling back to UTF-8 is delegated to the detector)
     * @param str2 not used by this method ({@link #main} passes
     *            {@code "text/html"}, presumably a MIME type)
     * @param mGraph graph receiving the extracted statements
     * @throws ExtractorException if charset detection fails with an I/O error,
     *             or if thrown by the parser or a component
     */
    public void extract(String str, InputStream inputStream, Charset charset, String str2, MGraph mGraph) throws ExtractorException {
        if (this.registry == null) {
            return;
        }
        String charsetName;
        if (charset == null) {
            if (!inputStream.markSupported()) {
                inputStream = new BufferedInputStream(inputStream);
            }
            try {
                charsetName = CharsetRecognizer.detect(inputStream, Constants.HTML, "UTF-8");
            } catch (IOException e) {
                String message = "Charset detection problem: " + e.getMessage();
                LOG.error(message, e);
                ExtractorException failure = new ExtractorException(message);
                // Keep the original I/O failure reachable through getCause().
                failure.initCause(e);
                throw failure;
            }
        } else {
            charsetName = charset.name();
        }
        Document dom = this.htmlParser.getDOM(inputStream, charsetName);
        HashMap<String, HtmlExtractionComponent> components = this.registry.getRegistry();
        long previousSize = mGraph.size();
        for (String extractorName : this.registry.getActiveExtractors()) {
            LOG.debug("Extractor: {}", extractorName);
            HtmlExtractionComponent component = components.get(extractorName);
            if (component == null) {
                // Active name without a registered component: nothing to run.
                continue;
            }
            component.extract(str, dom, null, mGraph);
            long currentSize = mGraph.size();
            if (previousSize < currentSize) {
                LOG.debug("{} Statements added: {}", currentSize - previousSize, extractorName);
                previousSize = currentSize;
            }
        }
    }

    /**
     * Command-line driver: runs the default extractor over each file named in
     * the arguments, reading it as UTF-8 into a fresh in-memory graph.
     *
     * @param strArr paths of the HTML files to process
     * @throws Exception if a file cannot be opened or extraction fails
     */
    public static void main(String[] strArr) throws Exception {
        HtmlExtractor htmlExtractor = new HtmlExtractor();
        for (int i = 0; i < strArr.length; i++) {
            File file = new File(strArr[i]);
            String documentUri = new UriRef(file.toURI().toString()).getUnicodeString();
            InputStream in = new FileInputStream(file);
            try {
                htmlExtractor.extract(documentUri, in, Charset.forName("UTF-8"), "text/html", new SimpleMGraph());
            } finally {
                // Release the file handle even when extraction throws.
                in.close();
            }
            System.out.println("Model for " + strArr[i]);
            System.out.println();
        }
    }
}
