/*
 * Decompiled with CFR 0.152.
 */
package com.chinamcloud.haihe.common.spider.processor;

import com.alibaba.fastjson.JSON;
import com.chinamcloud.haihe.backStageManagement.pojo.CrawlerOperation;
import com.chinamcloud.haihe.backStageManagement.pojo.Site;
import com.chinamcloud.haihe.backStageManagement.pojo.SiteInc;
import com.chinamcloud.haihe.backStageManagement.pojo.SpiderTemplate;
import com.chinamcloud.haihe.backStageManagement.service.SiteManageService;
import com.chinamcloud.haihe.common.Exception.CustomException;
import com.chinamcloud.haihe.common.i18n.MessageSource;
import com.chinamcloud.haihe.common.spider.NewsInfoExtractor;
import com.chinamcloud.haihe.common.spider.match.WebMatch;
import com.chinamcloud.haihe.common.spider.pojo.News;
import com.chinamcloud.haihe.common.spider.processor.BaseProcessor;
import com.chinamcloud.haihe.common.utils.HtmlUtils;
import com.chinamcloud.spider.model.BaseTemplate;
import io.lettuce.core.RedisCommandTimeoutException;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.utils.UrlUtils;

/**
 * {@link BaseProcessor} implementation that collects news links from a site's list page
 * (optionally via intermediate layout pages) and then crawls each news page, matching the
 * configured {@link SpiderTemplate} fields against the downloaded HTML.
 */
@Service
public class WebProcessor implements BaseProcessor {
    private static final Logger log = LogManager.getLogger(WebProcessor.class);
    private static final long serialVersionUID = 4705312152746624138L;

    /**
     * Site type for which the template must provide layout URLs and whose entry URL may
     * contain the {@code ${YYYY}}, {@code ${MM}} and {@code ${DD}} date placeholders.
     */
    private static final int LAYOUT_SITE_TYPE = 4;
    /** Maximum number of news links returned by {@link #getListUrl}. */
    private static final int MAX_NEWS_LINKS = 10;
    /** Pause between two consecutive page downloads in {@link #crawlNew}, in milliseconds. */
    private static final long CRAWL_DELAY_MILLIS = 300L;

    @Autowired
    private WebMatch webMatch;
    @Autowired
    private SiteManageService siteManageService;

    /**
     * Collects up to {@value #MAX_NEWS_LINKS} absolute news URLs for a site.
     *
     * <p>When {@code detailsUrl} is not blank it is the only link returned (resolved against
     * {@code url}) and no page is downloaded. Otherwise the page at {@code url} is downloaded
     * and the template's list-URL XPath and regex are applied to it. For sites of type
     * {@value #LAYOUT_SITE_TYPE} the date placeholders in {@code url} are first replaced with
     * today's date (the resulting URL is also written back to {@code siteInc}), and the news
     * links are gathered from the layout pages linked from that page.
     *
     * @param siteInc    site definition; supplies the type and, preferably, the template
     * @param dbSite     fallback source for the template when {@code siteInc} has none; may be null
     * @param url        entry page URL, also used as the base for resolving relative links
     * @param charset    charset passed to the HTML download
     * @param detailsUrl optional single news URL that short-circuits link extraction
     * @return at most {@value #MAX_NEWS_LINKS} news URLs
     * @throws CustomException when the template, the site type, the list-URL template, the
     *         layout-URL template (type {@value #LAYOUT_SITE_TYPE} only), the entry page HTML,
     *         the layout links or the news links are missing
     */
    @Override
    public List<String> getListUrl(SiteInc siteInc, Site dbSite, String url, String charset, String detailsUrl) {
        ArrayList<String> newsList = new ArrayList<String>(16);
        Integer type = siteInc.getType();
        SpiderTemplate webSpiderTemplate = siteInc.getSpiderTemplate();
        if (webSpiderTemplate == null) {
            if (dbSite == null || dbSite.getSpiderTemplate() == null) {
                throw new CustomException("site_template_cannot_be_empty");
            }
            webSpiderTemplate = dbSite.getSpiderTemplate();
        }
        if (type == null) {
            throw new CustomException("site_type_empty");
        }
        BaseTemplate listUrls = webSpiderTemplate.getListUrls();
        if (listUrls == null) {
            throw new CustomException("listUrls_empty");
        }
        boolean layoutSite = type == LAYOUT_SITE_TYPE;
        BaseTemplate layoutUrls = null;
        if (layoutSite && (layoutUrls = webSpiderTemplate.getLayoutUrls()) == null) {
            throw new CustomException("layoutUrls_empty");
        }

        // Links gathered through layout pages are resolved page by page inside the helper;
        // every other path is resolved against the entry URL at the end.
        boolean linksResolved = false;
        if (StringUtils.isNotBlank((CharSequence)detailsUrl)) {
            newsList.add(detailsUrl);
        } else {
            if (layoutSite) {
                url = WebProcessor.fillDatePlaceholders(url, LocalDate.now());
                siteInc.setUrl(url);
            }
            Html html = new Html(this.siteManageService.getHtml(url, charset));
            if (html.getDocument() == null) {
                throw new CustomException("failed_to_get_HTML");
            }
            if (layoutSite) {
                this.collectFromLayoutPages(html, url, charset, layoutUrls, listUrls, newsList);
                linksResolved = true;
            } else {
                HtmlUtils.getDataListByXPath(html, listUrls.getxPath(), newsList);
                HtmlUtils.getDataListByRegex(html, listUrls.getRegex(), newsList);
            }
            if (newsList.isEmpty()) {
                throw new CustomException("news_links_empty");
            }
        }
        if (!linksResolved) {
            WebProcessor.canonicalizeFrom(newsList, 0, url);
        }
        // Return an independent list rather than a subList view backed by the full result.
        return newsList.size() > MAX_NEWS_LINKS
                ? new ArrayList<String>(newsList.subList(0, MAX_NEWS_LINKS))
                : newsList;
    }

    /**
     * Crawls every URL in {@code newsList}, extracting the template fields of each page into a
     * per-page map (keys {@code url}, {@code msg} plus whatever the matcher stores), and
     * summarises the run in a {@link CrawlerOperation}.
     *
     * <p>A page counts as failed when its HTML is blank, when the matcher reports at least one
     * error for it, or when waiting for the matcher results fails. A
     * {@link RedisCommandTimeoutException} or {@link IllegalArgumentException} during download
     * aborts the run and reports every URL as failed. If the calling thread is interrupted, the
     * interrupt flag is restored, crawling stops and the URLs not yet crawled are counted as failed.
     *
     * @param newsList URLs of the news pages to crawl
     * @param siteInc  site definition supplying the site id, charset and template
     * @return the summary of the crawl, including the per-page result maps
     */
    @Override
    public CrawlerOperation crawlNew(List<String> newsList, SiteInc siteInc) {
        long beginTime = System.currentTimeMillis();
        int failedNum = 0;
        ArrayList<HashMap<String, Object>> mapList = new ArrayList<HashMap<String, Object>>();
        String charset = siteInc.getCharset();
        SpiderTemplate spiderTemplate = siteInc.getSpiderTemplate();
        for (String url : newsList) {
            try {
                // Throttle requests to the crawled site.
                Thread.sleep(CRAWL_DELAY_MILLIS);
            }
            catch (InterruptedException e) {
                // Keep the interrupt visible to the caller and stop: with the flag set, later
                // sleeps and Future.get() calls would fail immediately anyway.
                Thread.currentThread().interrupt();
                log.warn("Crawl interrupted; remaining pages are reported as failed", (Throwable)e);
                failedNum += newsList.size() - mapList.size();
                break;
            }
            HashMap<String, Object> map = new HashMap<String, Object>();
            map.put("url", url);
            map.put("msg", "");
            String htmlStr = null;
            try {
                htmlStr = this.siteManageService.getHtml(url, charset);
            }
            catch (RedisCommandTimeoutException | IllegalArgumentException e) {
                WebProcessor.appendMessage(map, "news_links_exception");
                mapList.add(map);
                // The whole run is reported as failed: failed count == total count.
                return new CrawlerOperation(siteInc.getSiteId(), new Date(beginTime), System.currentTimeMillis() - beginTime, newsList.size(), newsList.size(), mapList, JSON.toJSONString(mapList), 4);
            }
            if (StringUtils.isBlank((CharSequence)htmlStr)) {
                WebProcessor.appendMessage(map, "failed_to_get_HTML");
                mapList.add(map);
                // A page that could not be downloaded is a failed page.
                ++failedNum;
                continue;
            }
            Html newsHtml = new Html(htmlStr);
            News news = null;
            try {
                news = NewsInfoExtractor.getNewsByHtml(htmlStr, url);
                log.info("\u81ea\u52a8\u751f\u6210\u7684xpath:" + news.toString());
            }
            catch (Exception e) {
                // Automatic extraction is best effort; fall back to an empty News so that only
                // the configured template is used below.
                news = new News();
                log.error("\u81ea\u52a8\u83b7\u53d6xpath\u5931\u8d25\uff01", (Throwable)e);
            }
            // NOTE(review): all five calls write into the same HashMap; if getDataByType really
            // runs on other threads this is unsynchronised shared state - confirm in WebMatch.
            Future<Integer> titleErrorNum = this.webMatch.getDataByType(spiderTemplate.getSubject(), newsHtml, map, "subject", news.getTitleXPath(), news.getTitle());
            Future<Integer> contentErrorNum = this.webMatch.getDataByType(spiderTemplate.getHtmlDescription(), newsHtml, map, "htmlDescription", news.getContentXPath(), news.getContent());
            Future<Integer> picErrorNum = this.webMatch.getDataByType(spiderTemplate.getPics(), newsHtml, map, "pics", null, null);
            Future<Integer> reprintErrorNum = this.webMatch.getDataByType(spiderTemplate.getReprint(), newsHtml, map, "reprint", news.getSourceXPath(), news.getSource());
            Future<Integer> pubTimeErrorNum = this.webMatch.getDataByType((BaseTemplate)spiderTemplate.getPubTime(), newsHtml, map, "pubTime", news.getTimeXPath(), news.getTime());
            int pageErrors;
            try {
                pageErrors = titleErrorNum.get() + contentErrorNum.get() + picErrorNum.get() + reprintErrorNum.get() + pubTimeErrorNum.get();
            }
            catch (InterruptedException e) {
                // Restore the flag; the next iteration's sleep will notice it and stop the run.
                Thread.currentThread().interrupt();
                log.error("\u5f02\u6b65\u4efb\u52a1\u8fd4\u56de\u6570\u636e\u5931\u8d25\uff01", (Throwable)e);
                pageErrors = 1;
            }
            catch (ExecutionException e) {
                log.error("\u5f02\u6b65\u4efb\u52a1\u8fd4\u56de\u6570\u636e\u5931\u8d25\uff01", (Throwable)e);
                // The matcher results are unknown, so the page cannot be reported as a success.
                pageErrors = 1;
            }
            @SuppressWarnings("unchecked")
            List<String> picsList = (List<String>)map.get("pics");
            if (picsList != null) {
                // Picture links are resolved against the news page they were found on.
                WebProcessor.canonicalizeFrom(picsList, 0, url);
            }
            mapList.add(map);
            if (pageErrors > 0) {
                ++failedNum;
            }
        }
        return new CrawlerOperation(siteInc.getSiteId(), new Date(beginTime), System.currentTimeMillis() - beginTime, newsList.size(), failedNum, mapList, JSON.toJSONString(mapList), 4);
    }

    /**
     * Extracts the layout links from the entry page, downloads each layout page and appends the
     * news links found there to {@code newsList} as absolute URLs. Stops once more than
     * {@value #MAX_NEWS_LINKS} links have been collected.
     *
     * @throws CustomException when the entry page yields no layout links
     */
    private void collectFromLayoutPages(Html indexHtml, String indexUrl, String charset, BaseTemplate layoutUrls, BaseTemplate listUrls, ArrayList<String> newsList) {
        ArrayList<String> layoutUrlList = new ArrayList<String>(16);
        HtmlUtils.getDataListByXPath(indexHtml, layoutUrls.getxPath(), layoutUrlList);
        HtmlUtils.getDataListByRegex(indexHtml, layoutUrls.getRegex(), layoutUrlList);
        if (layoutUrlList.isEmpty()) {
            throw new CustomException("get_layout_url_exception");
        }
        for (String layoutLink : layoutUrlList) {
            if ("/".equals(layoutLink)) {
                continue;
            }
            // Layout links come from the entry page, so they are always resolved against it,
            // not against the previously visited layout page.
            String layoutPageUrl = UrlUtils.canonicalizeUrl((String)layoutLink, (String)indexUrl);
            Html layoutHtml = new Html(this.siteManageService.getHtml(layoutPageUrl, charset));
            if (layoutHtml.getDocument() == null) {
                continue;
            }
            int firstNewLink = newsList.size();
            HtmlUtils.getDataListByXPath(layoutHtml, listUrls.getxPath(), newsList);
            HtmlUtils.getDataListByRegex(layoutHtml, listUrls.getRegex(), newsList);
            // Links found on this layout page are relative to this page.
            WebProcessor.canonicalizeFrom(newsList, firstNewLink, layoutPageUrl);
            if (newsList.size() > MAX_NEWS_LINKS) {
                break;
            }
        }
    }

    /** Replaces the {@code ${YYYY}}, {@code ${MM}} and {@code ${DD}} placeholders in {@code url} with {@code date}. */
    private static String fillDatePlaceholders(String url, LocalDate date) {
        return url.replace("${YYYY}", String.valueOf(date.getYear()))
                .replace("${MM}", WebProcessor.twoDigits(date.getMonthValue()))
                .replace("${DD}", WebProcessor.twoDigits(date.getDayOfMonth()));
    }

    /** Formats a non-negative number with a leading zero when it has a single digit. */
    private static String twoDigits(int value) {
        return value < 10 ? "0" + value : String.valueOf(value);
    }

    /** Replaces every entry of {@code links} from {@code fromIndex} on with its form resolved against {@code baseUrl}. */
    private static void canonicalizeFrom(List<String> links, int fromIndex, String baseUrl) {
        ListIterator<String> iterator = links.listIterator(Math.min(fromIndex, links.size()));
        while (iterator.hasNext()) {
            String link = iterator.next();
            iterator.set(UrlUtils.canonicalizeUrl((String)link, (String)baseUrl));
        }
    }

    /** Appends the localized message for {@code messageKey} to the {@code msg} entry of {@code map}. */
    private static void appendMessage(HashMap<String, Object> map, String messageKey) {
        String msg = (String)map.get("msg");
        map.put("msg", msg + MessageSource.getMsg(messageKey));
    }
}

