package ilarkesto.net;

import ilarkesto.base.Str;
import ilarkesto.core.base.Utl;
import ilarkesto.core.logging.Log;
import ilarkesto.io.IO;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

/* loaded from: input_file:ilarkesto/net/WebCrawler.class */
public class WebCrawler {
    /** Logger shared by the crawler and its nested helper classes. */
    private static Log log = Log.get(WebCrawler.class);

    /** Decides which discovered URLs are followed; set lazily by {@link #crawl(String)} when still null. */
    private Filter filter;

    /** Optional receiver of crawl events; may be null, in which case pages are only parsed for links. */
    private Consumer consumer;

    /** Charset name used to decode HTML when the response does not provide one. */
    private String defaultEncoding = "UTF-8";

    /** Every URL that has been handed to the crawl routine so far (parameterised instead of a raw type). */
    private Set<String> crawledUrls = new HashSet<String>();

    /* loaded from: input_file:ilarkesto/net/WebCrawler$Consumer.class */
    /**
     * Callback interface through which the crawler reports what happens to each URL it visits.
     */
    public interface Consumer {
        /**
         * Called right after the connection to a URL has been established,
         * before its content type is inspected.
         *
         * @param str the URL being crawled
         * @param uRLConnection the connected connection
         */
        void onConnected(String str, URLConnection uRLConnection);

        /**
         * Called when reading the response of a URL fails with a {@link FileNotFoundException}.
         *
         * @param str the URL that could not be found
         */
        void onNotFound(String str);

        /**
         * Asked for URLs whose path does not look like an HTML page.
         *
         * @param str the URL in question
         * @return true to skip the URL without connecting to it
         */
        boolean skipNonHtml(String str);

        /**
         * Called when crawling a URL fails with an exception. If this method itself
         * throws, the crawler wraps that exception in a {@link RuntimeException}.
         *
         * @param exc the failure
         * @param str the URL being crawled
         * @param uRLConnection the connection used for the URL
         * @throws Exception to abort the crawl
         */
        void onError(Exception exc, String str, URLConnection uRLConnection) throws Exception;

        /**
         * Called for responses whose content type does not start with {@code text/html}.
         *
         * @param str the URL being crawled
         * @param uRLConnection the connected connection; its body has not been read by the crawler
         */
        void onUnknown(String str, URLConnection uRLConnection);

        /**
         * Called with the decoded content of an HTML page before its links are extracted.
         *
         * @param str the URL of the page
         * @param str2 the HTML text of the page
         */
        void onHtml(String str, String str2);
    }

    /* loaded from: input_file:ilarkesto/net/WebCrawler$DownloadConsumer.class */
    /**
     * {@link Consumer} that mirrors every visited resource into a local directory,
     * using the layout {@code <destinationDir>/<host>/<path>}.
     */
    public static class DownloadConsumer implements Consumer {
        private String destinationDir;
        // Never assigned in this class, so it keeps its default value of false.
        private boolean skipNonHtml;
        private String encoding;

        /**
         * @param str the destination directory
         * @param str2 the charset name used to write HTML files; when null the charset
         *        is determined separately for every page from its HTML
         */
        public DownloadConsumer(String str, String str2) {
            this.encoding = str2;
            this.destinationDir = str;
        }

        /**
         * Stores the HTML text of a page.
         *
         * @param str the URL of the page
         * @param str2 the HTML text
         */
        @Override // ilarkesto.net.WebCrawler.Consumer
        public void onHtml(String str, String str2) {
            File file = getFile(str);
            WebCrawler.log.info("Storing:", file);
            // Resolve the charset per page. Caching the first detected charset in the
            // field would force it onto every later page, whatever that page declares.
            String charset = this.encoding;
            if (charset == null) {
                charset = Str.getCharsetFromHtml(str2, "UTF-8");
            }
            IO.writeFile(file, str2, charset);
        }

        /**
         * Stores a non-HTML resource by handing the open connection to {@code IO.downloadToFile}.
         *
         * @param str the URL of the resource
         * @param uRLConnection the connected connection to read from
         */
        @Override // ilarkesto.net.WebCrawler.Consumer
        public void onUnknown(String str, URLConnection uRLConnection) {
            File file = getFile(str);
            WebCrawler.log.info("Storing:", file);
            IO.downloadToFile(uRLConnection, file);
        }

        /**
         * Maps a URL to its target file below the destination directory.
         * URLs whose path is empty or ends with a slash are stored as {@code _.html}
         * inside the corresponding directory.
         *
         * @throws RuntimeException if the URL is malformed
         */
        private File getFile(String str) {
            try {
                URL url = new URL(str);
                String path = url.getPath();
                if (Str.isBlank(path)) {
                    path = "/";
                }
                if (path.endsWith("/")) {
                    // Append the index file name instead of replacing the path, otherwise
                    // every directory URL of a host would be written to the same file.
                    path = path + "_.html";
                }
                // NOTE(review): path segments are taken from the URL as they are; confirm
                // callers only pass normalized URLs so ".." cannot leave destinationDir.
                return new File(this.destinationDir + "/" + url.getHost() + "/" + path);
            } catch (MalformedURLException e) {
                throw new RuntimeException(e);
            }
        }

        /** @return the value of the {@code skipNonHtml} flag */
        @Override // ilarkesto.net.WebCrawler.Consumer
        public boolean skipNonHtml(String str) {
            return this.skipNonHtml;
        }

        /** Missing resources are ignored. */
        @Override // ilarkesto.net.WebCrawler.Consumer
        public void onNotFound(String str) {
        }

        /** Rethrows the given exception, which aborts the crawl. */
        @Override // ilarkesto.net.WebCrawler.Consumer
        public void onError(Exception exc, String str, URLConnection uRLConnection) throws Exception {
            throw exc;
        }

        /** Nothing to do when a connection is established. */
        @Override // ilarkesto.net.WebCrawler.Consumer
        public void onConnected(String str, URLConnection uRLConnection) {
        }
    }

    /* loaded from: input_file:ilarkesto/net/WebCrawler$Filter.class */
    /**
     * Decides which of the URLs discovered on crawled pages are followed.
     */
    public interface Filter {
        /**
         * @param str a discovered URL
         * @return true if the URL should be crawled; rejected URLs are logged and dropped
         */
        boolean accept(String str);
    }

    /* loaded from: input_file:ilarkesto/net/WebCrawler$HostFilter.class */
    /**
     * {@link Filter} that only accepts URLs with the same protocol and host as a reference URL.
     */
    public static class HostFilter implements Filter {
        /** Protocol and host of the reference URL, e.g. {@code http://example.com}. */
        private String host;

        /**
         * @param str the reference URL whose protocol and host are accepted
         * @throws RuntimeException if the reference URL is malformed
         */
        public HostFilter(String str) {
            try {
                URL url = new URL(str);
                this.host = url.getProtocol() + "://" + url.getHost();
            } catch (MalformedURLException e) {
                throw new RuntimeException(e);
            }
        }

        /**
         * @param str the URL to check
         * @return true if the URL starts with the reference protocol and host and the
         *         host name ends there (end of string, port, path, query or fragment)
         */
        @Override // ilarkesto.net.WebCrawler.Filter
        public boolean accept(String str) {
            if (str == null || !str.startsWith(this.host)) {
                return false;
            }
            if (str.length() == this.host.length()) {
                return true;
            }
            // A plain prefix match would also accept "http://example.com.evil.org/" for
            // the host "http://example.com", so require a delimiter after the host.
            char next = str.charAt(this.host.length());
            return next == '/' || next == ':' || next == '?' || next == '#';
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:ilarkesto/net/WebCrawler$HtmlParser.class */
    /**
     * Minimal scanner that pulls the values of {@code href=} and {@code src=}
     * attributes out of an HTML text, consuming the text as it goes.
     */
    public class HtmlParser {
        /** The part of the document that has not been scanned yet. */
        private String html;

        public HtmlParser(String str) {
            this.html = str;
        }

        /**
         * Advances to the next usable attribute value.
         *
         * @return the next attribute value, or null when no further
         *         {@code href=} / {@code src=} occurrence exists
         */
        public String nextUrl() {
            while (true) {
                int attributeStart = Str.indexOf(this.html, new String[]{"href=", "src="}, 0);
                if (attributeStart < 0) {
                    return null;
                }
                this.html = this.html.substring(attributeStart);
                if (this.html.startsWith("href=")) {
                    this.html = this.html.substring(5);
                } else if (this.html.startsWith("src=")) {
                    this.html = this.html.substring(4);
                } else {
                    throw new IllegalStateException(this.html);
                }
                String value = nextAttributeValue();
                if (value != null) {
                    return value;
                }
                // No usable value behind this attribute; keep scanning the remaining text.
            }
        }

        /**
         * Reads the next quoted string from the remaining text.
         *
         * @return the text between the next quote character and its matching closing
         *         quote, or null if there is no complete quoted string or the value is
         *         the Google Analytics script placeholder
         */
        private String nextAttributeValue() {
            int quoteStart = Str.indexOf(this.html, new String[]{"\"", "'"}, 0);
            if (quoteStart < 0) {
                return null;
            }
            char quote = this.html.charAt(quoteStart);
            this.html = this.html.substring(quoteStart + 1);
            int quoteEnd = this.html.indexOf(quote);
            if (quoteEnd < 0) {
                return null;
            }
            String value = this.html.substring(0, quoteEnd);
            this.html = this.html.substring(quoteEnd + 1);
            if (value.contains("\" + gaJsHost + \"")) {
                return null;
            }
            return value;
        }
    }

    /**
     * Convenience entry point: crawls a site with a fresh crawler that stores
     * everything it visits.
     *
     * @param str the URL to start crawling at
     * @param str2 the directory the downloaded files are written to
     */
    public static void download(String str, String str2) {
        WebCrawler crawler = new WebCrawler();
        crawler.activateDownloading(str2);
        crawler.crawl(str);
    }

    /**
     * Crawls the given URL and, transitively, every accepted URL found on the visited pages.
     * When no filter has been configured, a {@link HostFilter} for the given URL is
     * installed first and stays in place for later calls.
     *
     * @param str the URL to start at
     */
    public void crawl(String str) {
        boolean filterMissing = this.filter == null;
        if (filterMissing) {
            this.filter = new HostFilter(str);
        }
        crawl(Utl.toList(str));
    }

    /**
     * Crawls the given URLs level by level: all URLs of one level are visited, the
     * URLs found on them are collected, and those not yet crawled and accepted by the
     * filter form the next level. Stops when a level yields no new URLs.
     */
    private void crawl(Collection<String> collection) {
        Collection<String> level = collection;
        do {
            Set<String> discovered = new HashSet<String>();
            for (String url : level) {
                Set<String> parsed = doCrawl(url);
                if (!parsed.isEmpty()) {
                    log.debug("  parsed", Integer.valueOf(parsed.size()), "URLs");
                }
                discovered.addAll(parsed);
            }

            Collection<String> nextLevel = new ArrayList<String>();
            for (String candidate : discovered) {
                if (this.crawledUrls.contains(candidate)) {
                    continue;
                }
                if (this.filter != null && !this.filter.accept(candidate)) {
                    log.debug("  filtered out:", candidate);
                    continue;
                }
                nextLevel.add(candidate);
            }
            level = nextLevel;
        } while (!level.isEmpty());
    }

    /**
     * Visits a single URL: records it as crawled, connects, reports the outcome to the
     * consumer (if any) and, for HTML responses, returns the URLs found in the page.
     *
     * @param str the URL to visit
     * @return the normalized URLs referenced by the page; empty for skipped, non-HTML,
     *         missing or failed URLs
     * @throws RuntimeException if crawling fails and there is no consumer, or the
     *         consumer's {@code onError} throws
     */
    private Set<String> doCrawl(String str) {
        log.debug("Crawling:", str);
        this.crawledUrls.add(str);
        if (!isProbablyHtml(str) && (this.consumer == null || this.consumer.skipNonHtml(str))) {
            return Collections.emptySet();
        }
        URLConnection connection = IO.openUrlConnection(str, null, null);
        try {
            connection.connect();
            if (this.consumer != null) {
                this.consumer.onConnected(str, connection);
            }
            String contentType = connection.getContentType();
            if (Str.isBlank(contentType)) {
                contentType = "application/unknown";
            }
            if (!contentType.startsWith("text/html")) {
                if (this.consumer != null) {
                    this.consumer.onUnknown(str, connection);
                }
                return Collections.emptySet();
            }

            // The charset is a parameter of Content-Type. The Content-Encoding header
            // names a compression scheme (e.g. "gzip") and must not be used as a charset.
            String encoding = charsetFromContentType(contentType);
            if (encoding == null) {
                encoding = this.defaultEncoding;
            }

            byte[] content;
            try {
                content = readContent(connection);
            } catch (FileNotFoundException e) {
                log.debug("  not found:", str);
                if (this.consumer != null) {
                    this.consumer.onNotFound(str);
                }
                return Collections.emptySet();
            } catch (IOException e) {
                throw new RuntimeException("Loading URL failed: " + str, e);
            }

            String html;
            try {
                html = new String(content, encoding);
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException("Loading URL failed: " + str, e);
            }

            // Decode again if the document itself declares a different charset.
            String declaredCharset = Str.getCharsetFromHtml(html, encoding);
            if (!encoding.equals(declaredCharset)) {
                try {
                    html = new String(content, declaredCharset);
                } catch (UnsupportedEncodingException e) {
                    // Best effort: keep the text decoded with the initial charset.
                    log.debug("  unsupported charset in HTML:", declaredCharset, "- keeping", encoding);
                }
            }

            if (this.consumer != null) {
                this.consumer.onHtml(str, html);
            }
            return parseUrls(html, str);
        } catch (Exception e) {
            if (this.consumer == null) {
                throw new RuntimeException(e);
            }
            try {
                this.consumer.onError(e, str, connection);
                return Collections.emptySet();
            } catch (Exception onErrorFailure) {
                throw new RuntimeException(onErrorFailure);
            }
        }
    }

    /**
     * Extracts the value of the {@code charset} parameter from a Content-Type header value.
     *
     * @return the charset name, or null if the header carries no charset parameter
     */
    private static String charsetFromContentType(String contentType) {
        for (String parameter : contentType.split(";")) {
            String trimmed = parameter.trim();
            if (!trimmed.regionMatches(true, 0, "charset=", 0, 8)) {
                continue;
            }
            String value = trimmed.substring(8).trim();
            if (value.length() >= 2 && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') {
                value = value.substring(1, value.length() - 1).trim();
            }
            return value.length() == 0 ? null : value;
        }
        return null;
    }

    /**
     * Reads the complete response body and closes the stream afterwards. Closing is
     * done defensively; it is harmless if the read helper already closed the stream.
     */
    private static byte[] readContent(URLConnection connection) throws IOException {
        java.io.InputStream in = connection.getInputStream();
        try {
            return IO.readToByteArray(in);
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // The content has been read (or reading already failed); nothing to recover.
            }
        }
    }

    /**
     * Guesses from the path of a URL whether it points to an HTML page: the last path
     * segment either has no dot at all or ends with a typical page extension.
     *
     * @param str the URL to inspect
     * @return true if the URL probably delivers HTML
     * @throws RuntimeException if the URL is malformed
     */
    static boolean isProbablyHtml(String str) {
        final String path;
        try {
            path = new URL(str).getPath().toLowerCase();
        } catch (MalformedURLException e) {
            throw new RuntimeException(e);
        }

        int lastSlash = path.lastIndexOf('/');
        String lastSegment = lastSlash > 0 ? path.substring(lastSlash) : path;
        if (!lastSegment.contains(".")) {
            return true;
        }
        for (String extension : new String[]{".html", ".htm", ".jsp", ".php"}) {
            if (lastSegment.endsWith(extension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Normalizes a URL or relative reference: removes the fragment, collapses
     * {@code /./} and resolves {@code /../} against the preceding path segment.
     * <ul>
     * <li>{@code ..} segments directly below the path root are dropped, so the host of
     * an absolute URL is never removed.</li>
     * <li>Leading {@code ..} segments of a relative reference are left untouched,
     * because they can only be resolved against a base URL.</li>
     * </ul>
     *
     * @param str the URL or relative reference
     * @return the normalized form
     */
    static String normalizeUrl(String str) {
        int fragment = str.indexOf('#');
        if (fragment >= 0) {
            str = str.substring(0, fragment);
        }
        String url = str.replace("/./", "/");

        int root = pathRootIndex(url);
        int searchFrom = Math.max(root, 0);
        while (true) {
            int parentRef = url.indexOf("/../", searchFrom);
            if (parentRef < 0) {
                return url;
            }
            if (parentRef == root) {
                // ".." directly below the root has no parent to go to: drop it.
                url = url.substring(0, parentRef) + url.substring(parentRef + 3);
                continue;
            }
            int segmentStart = url.lastIndexOf('/', parentRef - 1);
            String segment = url.substring(segmentStart + 1, parentRef);
            if (segment.equals("..")) {
                // The preceding segment is itself unresolved; leave both and move on.
                searchFrom = parentRef + 3;
            } else if (segmentStart < 0) {
                // Relative reference starting with "segment/../": remove both parts.
                url = url.substring(parentRef + 4);
            } else {
                url = url.substring(0, segmentStart) + url.substring(parentRef + 3);
            }
        }
    }

    /**
     * Locates the slash that starts the absolute path of a URL.
     *
     * @return the index of that slash; the length of the URL if it has an authority
     *         but no path; -1 for a relative reference without a leading slash
     */
    private static int pathRootIndex(String url) {
        int authority = -1;
        int scheme = url.indexOf("://");
        if (scheme > 0 && url.indexOf('/') == scheme + 1) {
            authority = scheme + 3;
        } else if (url.startsWith("//")) {
            authority = 2;
        }
        if (authority >= 0) {
            int slash = url.indexOf('/', authority);
            return slash >= 0 ? slash : url.length();
        }
        return url.startsWith("/") ? 0 : -1;
    }

    /**
     * Collects the URLs referenced by an HTML page.
     *
     * @param str the HTML text of the page
     * @param str2 the URL of the page, used to resolve relative references
     * @return the normalized, resolved URLs found in the page
     */
    private Set<String> parseUrls(String str, String str2) {
        Set<String> urls = new HashSet<String>();
        HtmlParser parser = new HtmlParser(str);
        for (String link = parser.nextUrl(); link != null; link = parser.nextUrl()) {
            String reference = normalizeUrl(link);
            if (Str.isBlank(reference)) {
                // Nothing left after normalization, e.g. a pure fragment link.
                continue;
            }
            urls.add(normalizeUrl(concatUrlWithRelative(str2, reference)));
        }
        return urls;
    }

    /**
     * Resolves a reference found on a page against the URL of that page.
     * <ul>
     * <li>{@code http://} and {@code https://} references are returned unchanged.</li>
     * <li>Protocol-relative references ({@code //host/path}) take the protocol of the page.</li>
     * <li>Root-relative references ({@code /path}) are resolved against protocol and host
     * of the page instead of its directory.</li>
     * <li>Everything else is appended to the base directory of the page.</li>
     * </ul>
     *
     * @param str the URL of the page containing the reference
     * @param str2 the reference
     * @return the resolved URL
     */
    static String concatUrlWithRelative(String str, String str2) {
        if (str2.startsWith("http://") || str2.startsWith("https://")) {
            return str2;
        }
        int schemeEnd = str.indexOf("://");
        if (schemeEnd > 0) {
            if (str2.startsWith("//")) {
                // Keep "scheme:" of the page and let the reference supply host and path.
                return str.substring(0, schemeEnd + 1) + str2;
            }
            if (str2.startsWith("/")) {
                int hostEnd = str.indexOf('/', schemeEnd + 3);
                String origin = hostEnd < 0 ? str : str.substring(0, hostEnd);
                return origin + str2;
            }
        }
        return getBaseUrl(str) + str2;
    }

    /**
     * Determines the directory part of a URL: everything up to the last slash that
     * follows the protocol prefix, always terminated by a slash.
     *
     * @param str an {@code http://} or {@code https://} URL
     * @return the base URL ending with {@code /}
     */
    static String getBaseUrl(String str) {
        int protocolLength = str.startsWith("https://") ? 8 : 7;
        int lastSlash = str.lastIndexOf('/');
        String base = lastSlash > protocolLength ? str.substring(0, lastSlash) : str;
        return base.endsWith("/") ? base : base + '/';
    }

    /**
     * Sets the filter that decides which discovered URLs are crawled.
     *
     * @param filter the filter; when null, {@link #crawl(String)} installs a host filter
     */
    public void setFilter(Filter filter) {
        this.filter = filter;
    }

    /**
     * Sets the charset used to decode HTML pages for which the crawler finds no
     * other encoding.
     *
     * @param str the charset name; the initial value is {@code UTF-8}
     */
    public void setDefaultEncoding(String str) {
        this.defaultEncoding = str;
    }

    /**
     * Sets the consumer that is notified about crawl events.
     *
     * @param consumer the consumer, or null to crawl without notifications
     */
    public void setConsumer(Consumer consumer) {
        this.consumer = consumer;
    }

    /**
     * Installs a {@link DownloadConsumer} that stores every visited resource,
     * writing HTML as UTF-8.
     *
     * @param str the directory the files are written to
     */
    public void activateDownloading(String str) {
        Consumer downloader = new DownloadConsumer(str, "UTF-8");
        setConsumer(downloader);
    }

    /**
     * Returns the URLs visited so far.
     *
     * @return the crawler's own set of crawled URLs (not a copy)
     */
    public Set<String> getCrawledUrls() {
        return this.crawledUrls;
    }
}
