package org.semanticdesktop.aperture.extractor.util;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.lexer.InputStreamSource;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.ParserFeedback;
import org.htmlparser.util.Translate;
import org.htmlparser.visitors.NodeVisitor;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:aperture-1.1.0.Beta1.jar:org/semanticdesktop/aperture/extractor/util/HtmlParserUtil.class */
public class HtmlParserUtil {
    /** Buffer size used for stream marking and for the parser's input source; mirrors {@code InputStreamSource.BUFFER_SIZE}. */
    private static final int BUFFER_SIZE = InputStreamSource.BUFFER_SIZE;

    /**
     * Feedback sink that forwards every parser message to an SLF4J logger.
     * The logger is obtained via {@code getClass()}, so it is named after this
     * anonymous class rather than after {@code HtmlParserUtil} itself.
     */
    private static final ParserFeedback FEEDBACK_LOGGER = new ParserFeedback() {
        private final Logger log = LoggerFactory.getLogger(getClass());

        /** Forwards an informational parser message at INFO level. */
        public void info(String message) {
            log.info(message);
        }

        /** Forwards a parser warning at WARN level. */
        public void warning(String message) {
            log.warn(message);
        }

        /** Forwards a parser error, together with its cause, at ERROR level. */
        public void error(String message, ParserException cause) {
            log.error(message, cause);
        }
    };

    /* loaded from: input_file:aperture-1.1.0.Beta1.jar:org/semanticdesktop/aperture/extractor/util/HtmlParserUtil$ContentExtractor.class */
    /**
     * Node visitor that accumulates the textual content of an HTML document
     * together with its title and the {@code author}, {@code description} and
     * {@code keywords} meta tags.
     *
     * <p>Text found while inside {@code STYLE} or {@code SCRIPT} elements is
     * skipped; text inside {@code XMP} or {@code PLAINTEXT} elements is
     * appended without entity decoding. An instance carries mutable state and
     * can be reused for another document after calling {@link #reset()}.
     */
    public static class ContentExtractor extends NodeVisitor {
        private static final String XMP = "XMP";
        private static final String PLAINTEXT = "PLAINTEXT";
        private static final String STYLE = "STYLE";
        private static final String SCRIPT = "SCRIPT";
        private static final String TITLE = "TITLE";

        /** Code point 160, the non-breaking space produced by {@code &nbsp;}. */
        private static final char NON_BREAKING_SPACE = (char) 160;

        /** True while visited text nodes should be appended to the text buffer. */
        private boolean inTextContext;
        /** True while the most recently opened tag is {@code TITLE}. */
        private boolean inTitleContext;
        /** True while character entities in text nodes should be decoded. */
        private boolean decodeText;
        private final StringBuilder textBuffer = new StringBuilder(32768);
        private final HashSet<String> keywordBuffer = new HashSet<String>();
        private String title;
        private String author;
        private String description;

        /** Creates an extractor with empty buffers and default context flags. */
        public ContentExtractor() {
            initFlags();
        }

        /** Puts the context flags into their start-of-document state. */
        private void initFlags() {
            this.inTextContext = true;
            this.inTitleContext = false;
            this.decodeText = true;
        }

        /** Discards everything collected so far so the instance can process a document from scratch. */
        public void reset() {
            initFlags();
            this.textBuffer.setLength(0);
            this.keywordBuffer.clear();
            this.title = null;
            this.author = null;
            this.description = null;
        }

        /** @return all text collected so far; each text node is followed by a single space */
        public String getText() {
            return this.textBuffer.toString();
        }

        /** @return an iterator over the distinct keywords collected from {@code keywords} meta tags */
        public Iterator<String> getKeywords() {
            return this.keywordBuffer.iterator();
        }

        /** @return the trimmed title text, or {@code null} if no title text has been visited */
        public String getTitle() {
            return this.title;
        }

        /** @return the content of the last {@code author} meta tag seen, or {@code null} */
        public String getAuthor() {
            return this.author;
        }

        /** @return the content of the last {@code description} meta tag seen, or {@code null} */
        public String getDescription() {
            return this.description;
        }

        /**
         * Handles a text node: records it as the title when in title context,
         * and appends it (decoded unless decoding is switched off) to the text
         * buffer when in text context.
         */
        public void visitStringNode(Text text) {
            if (this.inTitleContext) {
                this.title = resolveText(text.getText());
                if (this.title != null) {
                    this.title = this.title.trim();
                }
            }
            if (this.inTextContext) {
                String content = text.getText();
                if (this.decodeText) {
                    content = resolveText(content);
                }
                // A trailing space keeps adjacent text nodes from running together.
                this.textBuffer.append(content).append(' ');
            }
        }

        /** Decodes character entities and turns non-breaking spaces into ordinary spaces. */
        private String resolveText(String str) {
            return Translate.decode(str).replace(NON_BREAKING_SPACE, ' ');
        }

        /**
         * Updates the context flags for an opening tag and harvests metadata
         * from meta tags.
         */
        public void visitTag(Tag tag) {
            String tagName = tag.getTagName();
            if (isNonTextTag(tagName)) {
                // Content of style sheets and scripts is not document text.
                this.inTextContext = false;
                return;
            }
            this.inTextContext = true;
            this.inTitleContext = TITLE.equals(tagName);
            if (tag instanceof MetaTag) {
                handleMetaTag((MetaTag) tag);
            } else if (isRawTextTag(tagName)) {
                this.decodeText = false;
            }
        }

        /** Stores the author, description or keywords carried by a meta tag; other meta tags are ignored. */
        private void handleMetaTag(MetaTag metaTag) {
            String name = metaTag.getMetaTagName();
            String content = metaTag.getMetaContent();
            if (name == null || content == null) {
                return;
            }
            // equalsIgnoreCase is locale-independent, unlike toLowerCase() with the
            // default locale (e.g. Turkish maps 'I' to a dotless 'i').
            if ("author".equalsIgnoreCase(name)) {
                this.author = content;
            } else if ("description".equalsIgnoreCase(name)) {
                this.description = content;
            } else if ("keywords".equalsIgnoreCase(name)) {
                StringTokenizer tokens = new StringTokenizer(content, " ,\t", false);
                while (tokens.hasMoreTokens()) {
                    this.keywordBuffer.add(tokens.nextToken());
                }
            }
        }

        /**
         * Updates the context flags for a closing tag: leaves title context,
         * re-enables entity decoding after {@code XMP}/{@code PLAINTEXT}, and
         * re-enables text collection after {@code STYLE}/{@code SCRIPT} so that
         * text directly following such an element is not dropped.
         */
        public void visitEndTag(Tag tag) {
            this.inTitleContext = false;
            String tagName = tag.getTagName();
            if (isRawTextTag(tagName)) {
                this.decodeText = true;
            } else if (isNonTextTag(tagName)) {
                // NOTE(review): relies on getTagName() of an end tag returning the
                // bare upper-case name, as the XMP/PLAINTEXT check already assumes.
                this.inTextContext = true;
            }
        }

        /** Tells whether the named element's content must be excluded from the extracted text. */
        private static boolean isNonTextTag(String tagName) {
            return STYLE.equals(tagName) || SCRIPT.equals(tagName);
        }

        /** Tells whether the named element's content must be taken literally, without entity decoding. */
        private static boolean isRawTextTag(String tagName) {
            return XMP.equals(tagName) || PLAINTEXT.equals(tagName);
        }
    }

    /**
     * Parses an HTML stream and feeds every node to the given extractor.
     *
     * <p>If the parser signals an encoding change while visiting the document,
     * both the parser and the extractor are reset and the document is visited
     * once more.
     *
     * @param inputStream the HTML content; wrapped in a {@link BufferedInputStream}
     *        when it does not support mark/reset
     * @param charset the initial character set, or {@code null} to start with ISO-8859-1
     * @param contentExtractor the visitor that receives the parsed nodes
     * @throws ExtractorException if parsing fails or the encoding is not supported
     */
    public static void parse(InputStream inputStream, Charset charset, ContentExtractor contentExtractor) throws ExtractorException {
        // name() is the canonical charset name; displayName() may be localized and is
        // not guaranteed to be resolvable as an encoding name.
        String encoding = charset == null ? "ISO-8859-1" : charset.name();
        if (!inputStream.markSupported()) {
            inputStream = new BufferedInputStream(inputStream, BUFFER_SIZE);
        }
        inputStream.mark(BUFFER_SIZE);
        try {
            InputStreamSource source = new InputStreamSource(inputStream, encoding, BUFFER_SIZE);
            Parser parser = new Parser(new Lexer(new Page(source)), FEEDBACK_LOGGER);
            try {
                parser.visitAllNodesWith(contentExtractor);
            } catch (EncodingChangeException e) {
                // Must be caught before ParserException (its supertype): start over
                // so that nothing decoded with the wrong encoding survives.
                parser.reset();
                contentExtractor.reset();
                parser.visitAllNodesWith(contentExtractor);
            }
        } catch (ParserException e) {
            throw new ExtractorException(e);
        } catch (UnsupportedEncodingException e) {
            throw new ExtractorException(e);
        }
    }
}
