package cn.edu.hfut.dmic.webcollector.example;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;

/* loaded from: input_file:cn/edu/hfut/dmic/webcollector/example/DemoManualNewsCrawler.class */
/**
 * Demo crawler for the GitHub blog.
 *
 * <p>The constructor seeds the blog's front page and listing pages 2 through 5, all tagged
 * {@code "list"}. When a list page is visited, the links selected by
 * {@code h1.lh-condensed>a} are queued and tagged {@code "content"}. When a content page is
 * visited, its URL, its title (prefixed with the {@code "title_prefix"} configuration value)
 * and the leading part of its body (at most {@code "content_length_limit"} characters) are
 * printed to standard output.
 */
public class DemoManualNewsCrawler extends BreadthCrawler {
    /** Type tag given to the seeded listing pages. */
    private static final String TYPE_LIST = "list";

    /** Type tag given to the article links discovered on listing pages. */
    private static final String TYPE_CONTENT = "content";

    /** Configuration key holding the string prepended to every printed title. */
    private static final String CONF_TITLE_PREFIX = "title_prefix";

    /** Configuration key holding the maximum number of content characters to print. */
    private static final String CONF_CONTENT_LENGTH_LIMIT = "content_length_limit";

    /**
     * Creates the crawler and registers its seed URLs.
     *
     * @param str passed straight to the {@code BreadthCrawler} constructor
     *            (presumably the crawl storage path; confirm against the library)
     * @param z   passed straight to the {@code BreadthCrawler} constructor
     *            (presumably the auto-parse switch; confirm against the library)
     */
    public DemoManualNewsCrawler(String str, boolean z) {
        super(str, z);
        addSeedAndReturn("https://blog.github.com/").type(TYPE_LIST);
        for (int i = 2; i <= 5; i++) {
            // Plain concatenation keeps the page number in ASCII digits; String.format("%d")
            // would format it with the default locale's digits.
            addSeed("https://blog.github.com/page/" + i + "/", TYPE_LIST);
        }
        setThreads(50);
        getConf().setTopN(100);
    }

    /**
     * Handles one fetched page according to its type tag.
     *
     * <p>{@code "list"} pages only contribute follow-up links; {@code "content"} pages are
     * printed. Pages matching neither tag are ignored.
     *
     * @param page        the fetched page
     * @param crawlDatums collector that receives the links to crawl next
     */
    @Override // cn.edu.hfut.dmic.webcollector.fetcher.Visitor
    public void visit(Page page, CrawlDatums crawlDatums) {
        String url = page.url();
        if (page.matchType(TYPE_LIST)) {
            crawlDatums.add(page.links("h1.lh-condensed>a")).type(TYPE_CONTENT);
            return;
        }
        if (page.matchType(TYPE_CONTENT)) {
            // NOTE(review): first() presumably yields null when the selector matches nothing,
            // which would make this line throw; confirm and guard if such pages can occur.
            String title = page.select("h1[class=lh-condensed]").first().text();
            String content = page.selectText("div.content.markdown-body");
            String prefixedTitle = getConf().getString(CONF_TITLE_PREFIX) + title;
            String excerpt = truncate(content, getConf().getInteger(CONF_CONTENT_LENGTH_LIMIT));
            System.out.println("URL:\n" + url);
            System.out.println("title:\n" + prefixedTitle);
            System.out.println("content:\n" + excerpt);
        }
    }

    /**
     * Returns the leading part of {@code text}, at most {@code limit} characters long.
     *
     * <p>The end index is clamped to {@code [0, text.length()]}, so a text shorter than the
     * limit is returned whole instead of raising {@link StringIndexOutOfBoundsException},
     * and a negative limit yields the empty string.
     *
     * @param text  the text to shorten; must not be {@code null}
     * @param limit the maximum length, or {@code null} to return {@code text} unchanged
     * @return the possibly shortened text
     */
    private static String truncate(String text, Integer limit) {
        if (limit == null) {
            return text;
        }
        int end = Math.max(0, Math.min(limit, text.length()));
        return text.substring(0, end);
    }

    /**
     * Runs the demo: builds the crawler, sets the configuration values read by
     * {@link #visit(Page, CrawlDatums)}, and starts the crawl.
     *
     * @param strArr command-line arguments (unused)
     * @throws Exception if starting or running the crawl fails
     */
    public static void main(String[] strArr) throws Exception {
        DemoManualNewsCrawler crawler = new DemoManualNewsCrawler("crawl", false);
        // NOTE(review): the interval is presumably in milliseconds; confirm against the library.
        crawler.getConf().setExecuteInterval(5000);
        crawler.getConf().set(CONF_TITLE_PREFIX, "PREFIX_");
        crawler.getConf().set(CONF_CONTENT_LENGTH_LIMIT, 20);
        // NOTE(review): the argument is presumably the crawl depth; confirm against the library.
        crawler.start(4);
    }
}
