package org.jboss.elasticsearch.river.remote;

import java.io.ByteArrayInputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.SettingsException;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.jboss.elasticsearch.river.remote.HttpRemoteSystemClientBase;
import org.jboss.elasticsearch.river.remote.exception.RemoteDocumentNotFoundException;
import org.jboss.elasticsearch.river.remote.sitemap.AbstractSiteMap;
import org.jboss.elasticsearch.river.remote.sitemap.SiteMap;
import org.jboss.elasticsearch.river.remote.sitemap.SiteMapParser;
import org.jboss.elasticsearch.river.remote.sitemap.SiteMapURL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

/* loaded from: input_file:org/jboss/elasticsearch/river/remote/GetSitemapHtmlClient.class */
/**
 * Remote system client which obtains the list of documents from a sitemap downloaded over HTTP GET and
 * then loads detail of each document as HTML from the URL listed in the sitemap.
 * <p>
 * When the {@code htmlMapping} configuration section is present, the document detail is a map with one
 * entry per configured field, each extracted from the HTML by an optional CSS selector. Without the
 * mapping the whole parsed HTML is returned as a String.
 * <p>
 * Sitemap index files and dynamic loading of spaces are not supported.
 */
public class GetSitemapHtmlClient extends HttpRemoteSystemClientBase {
    /** Key in one {@code htmlMapping} entry: boolean (default {@code false}); if true, selected HTML is converted to plain text. */
    protected static final String CFG_HM_STRIP_HTML = "stripHtml";
    /** Key in one {@code htmlMapping} entry: CSS selector used to pick elements; whole document is used if missing. */
    protected static final String CFG_HM_CSS_SELECTOR = "cssSelector";
    /** Key in one {@code htmlMapping} entry: name of the element attribute to read the value from instead of element content. */
    protected static final String CFG_HM_VALUE_ATTRIBUTE = "valueAttribute";
    /** Configuration key holding URL of the sitemap. */
    protected static final String CFG_URL_GET_SITEMAP = "urlGetSitemap";
    /** Configuration key holding the optional field name -> extraction settings mapping. */
    protected static final String CFG_HTML_MAPPING = "htmlMapping";
    /** Document field with identifier derived from the document URL, see {@link #createIdFromUrl(String)}. */
    public static final String DOC_FIELD_ID = "id";
    /** Document field with URL of the document as listed in the sitemap. */
    public static final String DOC_FIELD_URL = "url";
    /** Document field with formatted last modification timestamp taken from the sitemap. */
    public static final String DOC_FIELD_LAST_MODIFIED = "last_modified";
    /** Document field with priority taken from the sitemap. */
    public static final String DOC_FIELD_PRIORITY = "priority";
    protected String urlGetSitemap;
    protected Map<String, Map<String, Object>> htmlMapping;
    protected SiteMapParser sitemapParser = new SiteMapParser();
    private static final ESLogger logger = Loggers.getLogger(GetSitemapHtmlClient.class);
    /** Lowercase file extensions of sitemap URLs which are skipped by {@link #processSitemap}. */
    private static final Set<String> IGNORED_EXTENSIONS = new HashSet<String>();
    /** Single characters replaced by underscore when a document id is created from URL. */
    private static final String ID_REPLACED_CHARS = ":.=\\/?&%*$#@+<>";

    static {
        IGNORED_EXTENSIONS.add("txt");
        IGNORED_EXTENSIONS.add("jpg");
        IGNORED_EXTENSIONS.add("jpeg");
        IGNORED_EXTENSIONS.add("tiff");
        IGNORED_EXTENSIONS.add("gif");
        IGNORED_EXTENSIONS.add("json");
        IGNORED_EXTENSIONS.add("otf");
        IGNORED_EXTENSIONS.add("eot");
        IGNORED_EXTENSIONS.add("svg");
        IGNORED_EXTENSIONS.add("ttf");
        IGNORED_EXTENSIONS.add("woff");
        IGNORED_EXTENSIONS.add("gz");
        IGNORED_EXTENSIONS.add("zip");
        IGNORED_EXTENSIONS.add("exe");
        IGNORED_EXTENSIONS.add("rar");
    }

    /**
     * Node visitor which appends text of every non-blank {@link TextNode} into a shared buffer, each
     * followed by one space. Non-breaking spaces (U+00A0) are treated as ordinary spaces.
     */
    public static final class ToTextNodeVisitor implements NodeVisitor {
        final StringBuilder buffer;

        ToTextNodeVisitor(StringBuilder sb) {
            this.buffer = sb;
        }

        public void head(Node node, int depth) {
            if (!(node instanceof TextNode)) {
                return;
            }
            String text = ((TextNode) node).text().replace('\u00A0', ' ').trim();
            if (!text.isEmpty()) {
                // trimmed text never ends with a space, so the separator is always needed
                this.buffer.append(text).append(" ");
            }
        }

        public void tail(Node node, int depth) {
            // nothing to do when leaving a node
        }
    }

    /**
     * Reads sitemap URL and HTML mapping from the configuration and initializes the HTTP client.
     *
     * @param config configuration map to read {@code urlGetSitemap} and {@code htmlMapping} from
     * @param spaceListLoadingEnabled must be false, dynamic loading of spaces is not supported by this client
     * @param pwdLoader passed to {@code initHttpClient}
     * @throws SettingsException if {@code spaceListLoadingEnabled} is true or a {@link ClassCastException}
     *           occurs during initialization (e.g. {@code htmlMapping} value is not a Map)
     */
    @Override // org.jboss.elasticsearch.river.remote.IRemoteSystemClient
    public void init(Map<String, Object> config, boolean spaceListLoadingEnabled, IPwdLoader pwdLoader) {
        this.urlGetSitemap = getUrlFromConfig(config, CFG_URL_GET_SITEMAP, true);
        // NOTE(review): the try block intentionally keeps its original scope, so a ClassCastException
        // raised inside initHttpClient is reported as invalid htmlMapping too - confirm this is wanted.
        try {
            @SuppressWarnings("unchecked") // only the outer Map type can be verified at runtime
            Map<String, Map<String, Object>> mapping = (Map<String, Map<String, Object>>) config.get(CFG_HTML_MAPPING);
            this.htmlMapping = mapping;
            if (spaceListLoadingEnabled) {
                throw new SettingsException("Dynamic Spaces obtaining is not supported, use 'remote/spacesIndexed' to configure one space or static list");
            }
            String remoteUser = initHttpClient(logger, config, pwdLoader, this.urlGetSitemap);
            logger.info("Configured sitemap.xml HTML client for URL '{}', remote system user '{}'.", this.urlGetSitemap,
                    remoteUser != null ? remoteUser : "Anonymous access");
        } catch (ClassCastException e) {
            throw new SettingsException("'remote/htmlMapping' configuration section is invalid");
        }
    }

    /**
     * Not supported by this client.
     *
     * @throws UnsupportedOperationException always
     */
    @Override // org.jboss.elasticsearch.river.remote.IRemoteSystemClient
    public List<String> getAllSpaces() throws Exception {
        throw new UnsupportedOperationException("Dynamic Spaces obtaining is not supported, use 'remote/spacesIndexed' to configure one space or static list");
    }

    /**
     * Downloads the configured sitemap and returns all documents listed in it. The parameters are not
     * used, the complete sitemap content is returned on every call.
     *
     * @return results holding every accepted sitemap URL as one document, starting at 0, with total equal
     *         to the number of returned documents
     */
    @Override // org.jboss.elasticsearch.river.remote.IRemoteSystemClient
    public ChangedDocumentsResults getChangedDocuments(String spaceKey, int startAt, Date updatedAfter) throws Exception {
        HttpRemoteSystemClientBase.HttpResponseContent response = performHttpGetCall(this.urlGetSitemap, null);
        logger.debug("HTTP GET sitemap response data: {}", response);
        List<Map<String, Object>> documents = processSitemap(response, this.urlGetSitemap);
        return new ChangedDocumentsResults(documents, 0, Integer.valueOf(documents.size()));
    }

    /**
     * Parses sitemap data and converts each listed URL into a document map with the {@code id},
     * {@code url}, {@code last_modified} and {@code priority} fields. URLs whose file extension is in the
     * ignored list are skipped.
     *
     * @param response HTTP response with the sitemap content and content type
     * @param url URL the sitemap was loaded from
     * @return list of documents, never null
     * @throws Exception if the sitemap is a sitemap index, or parsing fails
     */
    protected List<Map<String, Object>> processSitemap(HttpRemoteSystemClientBase.HttpResponseContent response, String url) throws Exception {
        AbstractSiteMap parsed = this.sitemapParser.parseSiteMap(response.contentType, response.content, new URL(url));
        if (parsed.isIndex()) {
            throw new Exception("Sitemap index format is not supported by this river!");
        }
        SiteMap siteMap = (SiteMap) parsed;
        List<Map<String, Object>> documents = new ArrayList<Map<String, Object>>();
        for (SiteMapURL siteMapUrl : siteMap.getSiteMapUrls()) {
            String documentUrl = siteMapUrl.getUrl().toExternalForm();
            String extension = Utils.getFileExtensionLowercase(documentUrl);
            if (extension != null && IGNORED_EXTENSIONS.contains(extension)) {
                logger.debug("Ignored URL as it contains ignored file extension: {}", documentUrl);
                continue;
            }
            Map<String, Object> document = new HashMap<String, Object>();
            document.put(DOC_FIELD_ID, createIdFromUrl(documentUrl));
            document.put(DOC_FIELD_URL, documentUrl);
            document.put(DOC_FIELD_LAST_MODIFIED, DateTimeUtils.formatISODateTime(siteMapUrl.getLastModified()));
            document.put(DOC_FIELD_PRIORITY, Double.valueOf(siteMapUrl.getPriority()));
            documents.add(document);
        }
        return documents;
    }

    /**
     * Creates document identifier from URL by replacing the {@code "://"} sequence and each of the
     * characters <code>: . = \ / ? &amp; % * $ # @ + &lt; &gt;</code> by an underscore.
     *
     * @param url to create id from, may be null
     * @return identifier, or null if the url is null
     */
    protected static String createIdFromUrl(String url) {
        if (url == null) {
            return null;
        }
        // scheme separator collapses into one underscore, remaining special characters are mapped one to one
        String withoutSchemeSeparator = url.replace("://", "_");
        StringBuilder id = new StringBuilder(withoutSchemeSeparator.length());
        for (int i = 0; i < withoutSchemeSeparator.length(); i++) {
            char c = withoutSchemeSeparator.charAt(i);
            id.append(ID_REPLACED_CHARS.indexOf(c) >= 0 ? '_' : c);
        }
        return id.toString();
    }

    /**
     * Loads the HTML page from the URL stored in the document and extracts the detail from it.
     *
     * @param spaceKey not used
     * @param documentId not used
     * @param document document data, the URL to load is read from its {@code url} field
     * @return null if the document has no URL; whole HTML as String if no {@code htmlMapping} is
     *         configured; otherwise map of configured field name to extracted value (value may be null)
     * @throws RemoteDocumentNotFoundException if the URL is invalid, the server returns 404, the response
     *           is not {@code text/html}, or HTML processing fails
     * @throws SettingsException if {@code htmlMapping} contains value of unexpected type
     * @throws Exception other errors of the HTTP call
     */
    @Override // org.jboss.elasticsearch.river.remote.IRemoteSystemClient
    public Object getChangedDocumentDetails(String spaceKey, String documentId, Map<String, Object> document) throws Exception, RemoteDocumentNotFoundException {
        String url = (String) document.get(DOC_FIELD_URL);
        if (url == null) {
            return null;
        }

        HttpRemoteSystemClientBase.HttpResponseContent response;
        try {
            response = performHttpGetCall(url, null);
        } catch (URISyntaxException e) {
            throw new RemoteDocumentNotFoundException("URL of sitemap is invalid: " + e.getMessage(), e);
        } catch (HttpRemoteSystemClientBase.HttpCallException e) {
            if (e.getStatusCode() == 404) {
                throw new RemoteDocumentNotFoundException(e);
            }
            throw e;
        }

        if (response.contentType == null || !response.contentType.contains("text/html")) {
            throw new RemoteDocumentNotFoundException("HTML document can't be processed as it is not html but: " + response.contentType);
        }

        try {
            Document html = Jsoup.parse(new ByteArrayInputStream(response.content), (String) null, url);
            if (this.htmlMapping == null) {
                return html.html();
            }
            Map<String, Object> fields = new HashMap<String, Object>();
            for (Map.Entry<String, Map<String, Object>> mapping : this.htmlMapping.entrySet()) {
                String fieldName = mapping.getKey();
                Map<String, Object> fieldConfig = mapping.getValue();
                fields.put(fieldName, extractMappedValue(html, fieldConfig));
            }
            return fields;
        } catch (ClassCastException e) {
            throw new SettingsException("'remote/htmlMapping' configuration section is invalid");
        } catch (Exception e) {
            throw new RemoteDocumentNotFoundException("HTML document can't be processed: " + e.getMessage(), e);
        }
    }

    /**
     * Extracts value of one mapped field from the parsed HTML document according to its settings.
     *
     * @return extracted value, null if the CSS selector matched nothing or no attribute value was found
     */
    private static String extractMappedValue(Document html, Map<String, Object> fieldConfig) {
        String cssSelector = Utils.trimToNull((String) fieldConfig.get(CFG_HM_CSS_SELECTOR));
        boolean stripHtml = XContentMapValues.nodeBooleanValue(fieldConfig.get(CFG_HM_STRIP_HTML), false);
        if (cssSelector == null) {
            // no selector configured, the whole document is the value
            return stripHtml ? convertNodeToText(html) : html.html();
        }
        Elements selected = html.select(cssSelector);
        if (selected == null || selected.isEmpty()) {
            return null;
        }
        // attribute setting is read only after something was selected, as the original code did
        String valueAttribute = Utils.trimToNull((String) fieldConfig.get(CFG_HM_VALUE_ATTRIBUTE));
        if (valueAttribute != null) {
            return joinAttributeValues(selected, valueAttribute);
        }
        if (stripHtml) {
            return convertElementsToText(selected);
        }
        // single match yields its inner HTML, multiple matches are kept with their own tags
        return selected.size() == 1 ? selected.html() : selected.outerHtml();
    }

    /**
     * Joins non-empty values of the given attribute from all elements, separated by a space.
     *
     * @return joined values passed through {@code Utils.trimToNull}
     */
    private static String joinAttributeValues(Elements elements, String attributeName) {
        StringBuilder joined = new StringBuilder();
        for (Element element : elements) {
            String value = Utils.trimToNull(element.attr(attributeName));
            if (value == null) {
                continue;
            }
            if (joined.length() > 0) {
                joined.append(" ");
            }
            joined.append(value);
        }
        return Utils.trimToNull(joined.toString());
    }

    /**
     * Converts node and all its descendants to plain text, texts of individual text nodes are separated
     * by a space.
     *
     * @param node to convert, may be null
     * @return trimmed text, empty String if the node is null
     */
    protected static String convertNodeToText(Node node) {
        if (node == null) {
            return "";
        }
        StringBuilder text = new StringBuilder();
        new NodeTraversor(new ToTextNodeVisitor(text)).traverse(node);
        return text.toString().trim();
    }

    /**
     * Converts all elements and their descendants to one plain text, texts of individual text nodes are
     * separated by a space.
     *
     * @param elements to convert, may be null
     * @return trimmed text, empty String if elements is null or empty
     */
    protected static String convertElementsToText(Elements elements) {
        if (elements == null || elements.isEmpty()) {
            return "";
        }
        StringBuilder text = new StringBuilder();
        NodeTraversor traversor = new NodeTraversor(new ToTextNodeVisitor(text));
        for (Element element : elements) {
            traversor.traverse(element);
        }
        return text.toString().trim();
    }
}
