/*
 * Decompiled with CFR 0.152.
 */
package com.chinamcloud.haihe.common.spider;

import com.chinamcloud.haihe.common.spider.pojo.News;
import com.chinamcloud.haihe.common.utils.HttpUtils;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.selector.Html;

public class NewsExtractor {
    /** Logger shared by all extractor instances. */
    public static final Logger LOG = LoggerFactory.getLogger(NewsExtractor.class);
    /** The parsed page this extractor works on. */
    protected Document doc;
    /** Statistics of every non-leaf element visited by {@link #computeInfo(Node)}. */
    protected HashMap<Element, CountInfo> infoMap = new HashMap<>();
    /** Every element visited by {@link #computeInfo(Node)}; descendants are stored before their parent. */
    protected List<Element> elementList = new ArrayList<>();

    /**
     * Creates an extractor bound to the given document.
     *
     * @param doc the parsed page to analyse
     */
    NewsExtractor(Document doc) {
        this.doc = doc;
    }

    /**
     * Strips elements that carry no article text (scripts, styles, iframes,
     * line breaks) from the document before any statistics are computed.
     */
    protected void clean() {
        Elements noise = this.doc.select("script,noscript,style,iframe,br");
        noise.remove();
    }

    /**
     * Recursively gathers text/link/tag statistics for the subtree rooted at
     * {@code node}.
     *
     * <p>Every visited element is appended to {@link #elementList} after its
     * descendants. Only elements that have child elements are stored in
     * {@link #infoMap}; leaf elements just report their text length.
     *
     * @param node the element to analyse; it is cast to {@link Element}
     * @return the accumulated statistics for the subtree
     */
    protected CountInfo computeInfo(Node node) {
        Element tag = (Element)node;
        Elements children = tag.children();
        CountInfo countInfo = new CountInfo();

        // Leaf element: its text length is the only statistic.
        if (children.isEmpty()) {
            int len = tag.text().length();
            countInfo.textCount = len;
            countInfo.leafList.add(len);
            this.elementList.add(tag);
            return countInfo;
        }

        // Inner element: aggregate the statistics of all child elements.
        for (Element child : children) {
            CountInfo childCountInfo = this.computeInfo(child);
            countInfo.textCount += childCountInfo.textCount;
            countInfo.linkTextCount += childCountInfo.linkTextCount;
            countInfo.tagCount += childCountInfo.tagCount;
            countInfo.linkTagCount += childCountInfo.linkTagCount;
            countInfo.leafList.addAll(childCountInfo.leafList);
            countInfo.densitySum += childCountInfo.density;
            countInfo.pCount += childCountInfo.pCount;
        }
        ++countInfo.tagCount;

        String tagName = tag.tagName();
        if (tagName.equals("a")) {
            // Everything below an anchor counts as link text.
            countInfo.linkTextCount = countInfo.textCount;
            ++countInfo.linkTagCount;
        } else if (tagName.equals("p")) {
            ++countInfo.pCount;
        }

        // Density = non-link characters per non-link tag.
        int pureLen = countInfo.textCount - countInfo.linkTextCount;
        int pureTags = countInfo.tagCount - countInfo.linkTagCount;
        countInfo.density = pureLen == 0 || pureTags == 0 ? 0.0 : (double)pureLen / (double)pureTags;

        this.infoMap.put(tag, countInfo);
        this.elementList.add(tag);
        return countInfo;
    }

    /**
     * Computes the content score of an element from its recorded statistics:
     * {@code ln(sqrt(var + 1)) * densitySum * ln(pureText + 1) * log10(pCount + 2)}.
     *
     * @param tag an element that has an entry in {@link #infoMap}
     * @return the score; larger means more likely to be the article body
     */
    protected double computeScore(Element tag) {
        CountInfo info = this.infoMap.get(tag);
        int pureTextLength = info.textCount - info.linkTextCount;
        double spread = Math.sqrt(this.computeVar(info.leafList) + 1.0);

        // Factors are multiplied in a fixed order so the floating-point result is stable.
        double result = Math.log(spread);
        result = result * info.densitySum;
        result = result * Math.log(pureTextLength + 1);
        result = result * Math.log10(info.pCount + 2);
        return result;
    }

    /**
     * Returns the population variance of the given leaf text lengths.
     *
     * <p>An empty list yields {@code 0}. A single value yields half of that
     * value, computed with integer division (odd values are truncated).
     *
     * @param data leaf text lengths
     * @return the variance, or the special values described above
     */
    protected double computeVar(ArrayList<Integer> data) {
        int n = data.size();
        if (n == 0) {
            return 0.0;
        }
        if (n == 1) {
            // Integer division on purpose: mirrors the historical behaviour.
            int half = data.get(0) / 2;
            return half;
        }

        double total = 0.0;
        for (int k = 0; k < n; ++k) {
            total += (double)data.get(k).intValue();
        }
        double mean = total / (double)n;

        double squared = 0.0;
        for (int k = 0; k < n; ++k) {
            double delta = (double)data.get(k).intValue() - mean;
            squared += delta * delta;
        }
        return squared / (double)n;
    }

    /**
     * Locates the element most likely to hold the article body.
     *
     * <p>The document is cleaned, statistics are computed for the whole body,
     * and every recorded element other than the body and anchors is scored.
     * Elements whose text is dominated by links (similarity between the text
     * with and without anchors below 0.2) are skipped.
     *
     * @return the highest scoring element
     * @throws Exception if no element obtained a positive score
     */
    public Element getContentElement() throws Exception {
        this.clean();
        Element body = this.doc.body();
        this.computeInfo(body);

        double maxScore = 0.0;
        Element content = null;
        for (Map.Entry<Element, CountInfo> entry : this.infoMap.entrySet()) {
            Element tag = entry.getKey();
            if (tag == body || tag.tagName().equals("a")) {
                continue;
            }
            // Re-parse the element and drop anchors to measure how much of its text is link text.
            Document withoutLinks = Jsoup.parse(tag.toString());
            withoutLinks.select("a").remove();
            if (this.strSim(tag.text(), withoutLinks.text()) < 0.2) {
                continue;
            }
            double score = this.computeScore(tag);
            if (score > maxScore) {
                maxScore = score;
                content = tag;
            }
        }
        if (content == null) {
            throw new Exception("extraction failed");
        }
        return content;
    }

    /**
     * Extracts content, title, publication time and source from the document.
     *
     * <p>Content extraction is mandatory: a failure there is logged and
     * rethrown. Title, time and source are best effort; their failures are
     * logged and the corresponding fields are left unset.
     *
     * @return the populated news object
     * @throws Exception if the content element cannot be determined
     */
    public News getNews() throws Exception {
        Element contentElement;
        News news = new News();
        try {
            contentElement = this.getContentElement();
            news.setContentElement(contentElement);
            // Set the text right away so it is available even when no title is found below.
            news.setContent(contentElement.text());
            news.setContentXPath(this.getXpath(this.getMinElement(contentElement)));
        }
        catch (Exception ex) {
            LOG.info("news content extraction failed,extraction abort", ex);
            throw new Exception(ex);
        }
        if (this.doc.baseUri() != null) {
            news.setUrl(this.doc.baseUri());
        }
        try {
            Element title = this.getTitle(contentElement);
            news.setTitle(title.text());
            news.setTitleElement(title);
            news.setTitleXPath(this.getXpath(title));
        }
        catch (Exception ex) {
            LOG.info("title extraction failed", ex);
        }
        Element titleElement = news.getTitleElement();
        if (titleElement != null) {
            // Narrow the content to a child that excludes the title, when one exists.
            Element refined = this.isNotHas(contentElement, titleElement);
            news.setContent(refined.text());
            news.setContentElement(refined);
            news.setContentXPath(this.getXpath(this.getMinElement(refined)));
        }
        try {
            this.getTime(news);
            Element timeElement = news.getTimeElement();
            if (timeElement != null) {
                news.setTimeXPath(this.getXpath(timeElement));
            }
        }
        catch (Exception ex) {
            LOG.info("news time extraction failed", ex);
        }
        try {
            this.getSource(news);
            Element sourceElement = news.getSourceElement();
            if (sourceElement != null) {
                news.setSourceXPath(this.getXpath(sourceElement));
            }
        }
        catch (Exception ex) {
            LOG.info("news source extraction failed", ex);
        }
        return news;
    }

    /**
     * Looks for a direct child of {@code p} that carries almost all of
     * {@code p}'s text (similarity above 0.9) without resembling {@code c}
     * (similarity to {@code c} at most 0.5 away from 1 is treated as "resembles").
     *
     * @param p the candidate content element
     * @param c the element to keep out of the content, typically the title
     * @return the first qualifying child, or {@code p} when there is none
     */
    protected Element isNotHas(Element p, Element c) {
        Elements kids = p.children();
        if (kids == null) {
            return p;
        }
        for (Element kid : kids) {
            boolean resemblesC = Math.abs(1.0 - this.strSim(kid.text(), c.text())) < 0.5;
            if (resemblesC) {
                continue;
            }
            boolean coversParent = this.strSim(kid.text(), p.text()) > 0.9;
            if (coversParent) {
                return kid;
            }
        }
        return p;
    }

    /**
     * Chooses the window of elements in which the news source is searched and
     * delegates to {@link #getSource(List, News)}.
     *
     * <p>When a time element is known, the window is the 10 elements before
     * and after it in {@link #elementList}. Otherwise the window is the range
     * between the title and the content's first descendant, replaced by the
     * 10 elements starting at the content element when no time text was found.
     * All ranges are clamped to the list bounds.
     *
     * @param news the news object to read positions from and write the source to
     */
    protected void getSource(News news) {
        Element timeElement = news.getTimeElement();
        List<Element> elements = null;
        if (timeElement != null) {
            int timeIndex = this.elementList.indexOf(timeElement);
            elements = this.sourceWindow(timeIndex >= 10 ? timeIndex - 10 : 0, timeIndex + 10);
        } else {
            Element contentElement = news.getContentElement();
            if (contentElement == null) {
                return;
            }
            int contentIndex = this.elementList.indexOf(contentElement);
            if (contentIndex < 0) {
                // Content element was never indexed; there is no position to search around.
                return;
            }
            Element titleElement = news.getTitleElement();
            int titleIndex = 0;
            if (titleElement != null) {
                titleIndex = this.elementList.indexOf(titleElement);
                if (titleIndex == -1) {
                    titleIndex = this.elementList.indexOf(titleElement.parent());
                }
            }
            int childNum = this.getChildNum(contentElement);
            if (titleIndex < contentIndex) {
                elements = this.sourceWindow(titleIndex, contentIndex - childNum);
            }
            if (StringUtils.isBlank(news.getTime())) {
                elements = this.sourceWindow(contentIndex, contentIndex + 10);
            }
        }
        if (elements == null) {
            return;
        }
        this.getSource(elements, news);
    }

    /** Returns {@code elementList[from, to)} with both bounds clamped into the valid range. */
    private List<Element> sourceWindow(int from, int to) {
        int size = this.elementList.size();
        int start = Math.max(0, Math.min(from, size));
        int end = Math.max(start, Math.min(to, size));
        return this.elementList.subList(start, end);
    }

    /** Matches a string that consists of exactly one punctuation character. */
    private static final Pattern SOURCE_PUNCTUATION = Pattern.compile("[`~!@#$%^&*()_\\-+=<>?:\"{}|,.\\/;'\\[\\]\u00b7~\uff01@#\uffe5%\u2026\u2026&*\uff08\uff09\u2014\u2014\\-+={}|\u300a\u300b\uff1f\uff1a\u201c\u201d\u3010\u3011\u3001\uff1b\u2018\u2019\uff0c\u3002\u3001]");

    /**
     * Searches the given elements, depth first, for one whose text contains a
     * "source" / "origin" label and records it on {@code news}.
     *
     * <p>If the element's own text is essentially just the label, the source
     * is taken from the whole element, but only when one of its children has
     * meaningful own text; otherwise source and source element are set to
     * {@code null}. If the own text is more than the label, the whole element
     * text is used directly.
     *
     * @param elements candidates to inspect; {@code null} is treated as empty
     * @param news     receives the source text and element
     * @return {@code true} as soon as a labelled element was encountered
     */
    protected boolean getSource(List<Element> elements, News news) {
        if (elements == null) {
            return false;
        }
        for (Element element : elements) {
            Elements children = element.children();
            // Prefer the deepest labelled element.
            if (children != null && this.getSource(children, news)) {
                return true;
            }
            String fullText = element.text();
            if (!fullText.contains("\u6765\u6e90") && !fullText.contains("\u51fa\u5904")) {
                continue;
            }
            String ownText = element.ownText();
            boolean labelOnly = this.strSim(ownText, "\u6765\u6e90") > 0.5 || this.strSim(ownText, "\u51fa\u5904") > 0.5;
            if (!labelOnly) {
                news.setSource(fullText);
                news.setSourceElement(element);
            } else if (children != null && children.size() != 0) {
                boolean childHasValue = false;
                for (Element child : children) {
                    String childText = child.ownText().trim();
                    if (StringUtils.isNotBlank(childText) && !SOURCE_PUNCTUATION.matcher(childText).matches()) {
                        childHasValue = true;
                        break;
                    }
                }
                // NOTE(review): the matching child only acts as a flag; the whole element
                // (label included) is reported. Confirm this is intended rather than
                // reporting the child's own text.
                news.setSource(childHasValue ? fullText.trim() : null);
                news.setSourceElement(childHasValue ? element : null);
            }
            return true;
        }
        return false;
    }

    /**
     * Finds the publication time and stores it on {@code news}.
     *
     * <p>Search order: (1) the elements between the title and the content,
     * (2) the elements starting at the content element, (3) a regex scan of
     * the HTML around the content via {@link #getTime(Element)}. Windows are
     * clamped to the bounds of {@link #elementList}, so a content element near
     * the end of the page no longer aborts the search.
     *
     * @param news supplies title/content elements and receives time and time element
     * @throws Exception if no time can be found at all
     */
    protected void getTime(News news) throws Exception {
        Element contentElement = news.getContentElement();
        Element titleElement = news.getTitleElement();
        if (contentElement == null) {
            news.setTime(this.getTime(contentElement));
            return;
        }
        int contentIndex = this.elementList.indexOf(contentElement);
        int titleIndex = 0;
        if (titleElement != null) {
            titleIndex = this.elementList.indexOf(titleElement);
            if (titleIndex == -1) {
                titleIndex = this.elementList.indexOf(titleElement.parent());
            }
            if (titleIndex == -1) {
                titleIndex = 0;
            }
        }
        int contentChildren = this.getChildNum(contentElement);

        if (titleIndex < contentIndex) {
            // Descendants precede their parent in elementList, so this is the content's first descendant.
            int firstDescendant = contentIndex - contentChildren;
            int end = titleIndex < firstDescendant ? firstDescendant + 5 : contentIndex;
            this.getTimeFromNewsAndElements(news, this.timeWindow(titleIndex, end));
        }
        if (StringUtils.isBlank(news.getTime())) {
            this.getTimeFromNewsAndElements(news, this.timeWindow(contentIndex, contentChildren + contentIndex + 10));
        }
        if (StringUtils.isBlank(news.getTime())) {
            news.setTime(this.getTime(contentElement));
        }
    }

    /** Returns {@code elementList[from, to)} with both bounds clamped into the valid range. */
    private List<Element> timeWindow(int from, int to) {
        int size = this.elementList.size();
        int start = Math.max(0, Math.min(from, size));
        int end = Math.max(start, Math.min(to, size));
        return this.elementList.subList(start, end);
    }

    /**
     * Scans the elements in order and records the first one whose own text
     * contains a recognisable date/time.
     *
     * @param news     receives the time element and the matched time text
     * @param elements candidates to inspect
     */
    protected void getTimeFromNewsAndElements(News news, List<Element> elements) {
        for (Element candidate : elements) {
            String found = this.getTimeFormRegex(candidate.ownText());
            if (StringUtils.isNotBlank(found)) {
                news.setTimeElement(candidate);
                news.setTime(found);
                break;
            }
        }
    }

    /** Date/time patterns tried in order, from most to least specific. */
    private static final Pattern[] TIME_PATTERNS = new Pattern[]{
        Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"),
        Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2][0-9])[^0-9]{1,5}?([0-9]{1,2})"),
        Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-3][0-9])"),
        Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})")
    };

    /**
     * Returns the first date/time-looking substring of {@code text}.
     *
     * <p>The patterns are compiled once and tried from the most specific
     * (year to second) to the least specific (year to day).
     *
     * @param text the text to scan; {@code null} is treated as empty
     * @return the matched substring, or an empty string when nothing matches
     */
    protected String getTimeFormRegex(String text) {
        if (text == null) {
            return "";
        }
        for (Pattern pattern : TIME_PATTERNS) {
            Matcher matcher = pattern.matcher(text);
            if (matcher.find()) {
                return matcher.group();
            }
        }
        return "";
    }

    /** Year to second timestamp; the hour group accepts 0-29 so that 00, 10 and 20 match as well. */
    private static final Pattern FULL_TIMESTAMP = Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})");

    /**
     * Scans the HTML around the content element for a full timestamp.
     *
     * <p>Starting two ancestors above {@code contentElement} (without going
     * past the body), the outer HTML of up to six successive ancestors is
     * searched. When no full timestamp is found, {@link #getDate(Element)} is
     * used as a date-only fallback.
     *
     * @param contentElement the element the search is centred on
     * @return the time as {@code yyyy-M-d H:m:s} built from the matched groups,
     *         or the result of the date-only fallback
     * @throws Exception with message "time not found" if neither search matches
     */
    protected String getTime(Element contentElement) throws Exception {
        Element current = contentElement;
        for (int i = 0; i < 2; ++i) {
            if (current == null || current == this.doc.body()) {
                break;
            }
            Element parent = current.parent();
            if (parent == null) {
                break;
            }
            current = parent;
        }
        for (int i = 0; i < 6 && current != null; ++i) {
            Matcher matcher = FULL_TIMESTAMP.matcher(current.outerHtml());
            if (matcher.find()) {
                return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
            }
            if (current == this.doc.body()) {
                // Nothing above the body is searched; rescanning it would give the same result.
                break;
            }
            current = current.parent();
        }
        try {
            return this.getDate(contentElement);
        }
        catch (Exception ex) {
            throw new Exception("time not found", ex);
        }
    }

    /** Year, month and day separated by one to five non-digit characters. */
    private static final Pattern DATE_ONLY = Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})");

    /**
     * Scans the HTML around the content element for a date without a time.
     *
     * <p>Starting two ancestors above {@code contentElement} (without going
     * past the body), the outer HTML of up to six successive ancestors is
     * searched.
     *
     * @param contentElement the element the search is centred on
     * @return the date as {@code yyyy-M-d} built from the matched groups
     * @throws Exception with message "date not found" if nothing matches
     */
    protected String getDate(Element contentElement) throws Exception {
        Element current = contentElement;
        for (int i = 0; i < 2; ++i) {
            if (current == null || current == this.doc.body()) {
                break;
            }
            Element parent = current.parent();
            if (parent == null) {
                break;
            }
            current = parent;
        }
        for (int i = 0; i < 6 && current != null; ++i) {
            Matcher matcher = DATE_ONLY.matcher(current.outerHtml());
            if (matcher.find()) {
                return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3);
            }
            if (current == this.doc.body()) {
                // Nothing above the body is searched; rescanning it would give the same result.
                break;
            }
            current = current.parent();
        }
        throw new Exception("date not found");
    }

    /**
     * Measures how similar two strings are as the length of their longest
     * common subsequence divided by the length of the longer string.
     *
     * @param a first string
     * @param b second string
     * @return a value in {@code [0, 1]}; {@code 0} when either string is empty
     *         or one is at least three times as long as the other
     */
    protected double strSim(String a, String b) {
        int longer = Math.max(a.length(), b.length());
        int shorter = Math.min(a.length(), b.length());
        if (shorter == 0) {
            return 0.0;
        }
        double lengthRatio = (double)longer / (double)shorter;
        if (lengthRatio >= 3.0) {
            return 0.0;
        }
        return (double)this.lcs(a, b) / (double)longer;
    }

    /**
     * Locates the element that holds the article title.
     *
     * <p>Strategies, in order:
     * <ol>
     *   <li>headings ({@code h1}-{@code h6}) in the body, weighted by position
     *       and similarity to the document title;</li>
     *   <li>elements shortly before the content whose text is more than 0.5
     *       similar to the document title;</li>
     *   <li>elements whose id or class starts or ends with "title" or
     *       "subject";</li>
     *   <li>the document's {@code <title>} element.</li>
     * </ol>
     *
     * @param contentElement the previously determined content element
     * @return the title element
     * @throws Exception with message "title not found" if every strategy fails
     */
    protected Element getTitle(final Element contentElement) throws Exception {
        final List<Element> titleList = new ArrayList<Element>();
        final List<Double> titleSim = new ArrayList<Double>();
        final AtomicInteger contentIndex = new AtomicInteger();
        final String metaTitle = this.doc.title().trim();
        Element titleEl = this.doc.getElementsByTag("title").first();

        // Strategy 1: headings compared against the document title.
        if (!metaTitle.isEmpty()) {
            this.doc.body().traverse(new NodeVisitor(){

                @Override
                public void head(Node node, int depth) {
                    if (!(node instanceof Element)) {
                        return;
                    }
                    Element tag = (Element)node;
                    if (tag == contentElement) {
                        // Remember how many headings precede the content.
                        contentIndex.set(titleList.size());
                        return;
                    }
                    if (Pattern.matches("h[1-6]", tag.tagName())) {
                        titleSim.add(NewsExtractor.this.strSim(tag.text().trim(), metaTitle));
                        titleList.add(tag);
                    }
                }

                @Override
                public void tail(Node node, int depth) {
                }
            });
            // Consider headings up to five positions past the content.
            int limit = Math.min(contentIndex.get() + 5, titleSim.size());
            double bestScore = 0.0;
            int bestIndex = -1;
            for (int i = 0; i < limit; ++i) {
                double score = (double)(i + 1) * titleSim.get(i);
                if (score > bestScore) {
                    bestScore = score;
                    bestIndex = i;
                }
            }
            if (bestIndex != -1) {
                return titleList.get(bestIndex);
            }
        }

        // Strategy 2: elements preceding the content that resemble the document title.
        int contentIndexOf = this.elementList.indexOf(contentElement);
        int titleIndexOf = this.elementList.indexOf(titleEl);
        int firstDescendant = contentIndexOf - this.getChildNum(contentElement);
        int wantedFrom = firstDescendant > 50 ? firstDescendant - 50 : Math.max(titleIndexOf, 0);
        // Clamp so that an unindexed content element or a late title yields an empty window
        // instead of an exception that would skip the remaining strategies.
        int to = Math.max(0, Math.min(contentIndexOf, this.elementList.size()));
        int from = Math.max(0, Math.min(wantedFrom, to));
        titleSim.clear();
        titleList.clear();
        for (Element element : this.elementList.subList(from, to)) {
            String text = element.text().trim();
            if (StringUtils.isBlank(text)) {
                continue;
            }
            double sim = this.strSim(text, metaTitle);
            if (sim > 0.5) {
                titleSim.add(sim);
                titleList.add(element);
            }
        }
        if (!titleList.isEmpty()) {
            double bestScore = 0.0;
            int bestIndex = -1;
            for (int i = 0; i < titleList.size(); ++i) {
                double score = (double)(i + 1) * titleSim.get(i);
                // Pick the weighted score closest to 1.
                if (Math.abs(1.0 - score) < Math.abs(1.0 - bestScore)) {
                    bestScore = score;
                    bestIndex = i;
                }
            }
            if (bestIndex != -1) {
                return titleList.get(bestIndex);
            }
        }

        // Strategy 3: elements named like a title container.
        Elements titles = this.doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title],*[id^=subject],*[id$=subject],*[class^=subject],*[class$=subject]");
        if (titles.size() > 0) {
            double max = 0.0;
            Element titleTarget = null;
            for (Element title : titles) {
                Elements titlesChild = title.children();
                if (titlesChild != null) {
                    for (Element element : titlesChild) {
                        if (Pattern.matches("h[1-6]", element.tagName())) {
                            return element;
                        }
                        double sim = this.strSim(element.text(), contentElement.text());
                        if (sim > max) {
                            max = sim;
                            titleTarget = element;
                        }
                    }
                }
                if (max > 0.0) {
                    return titleTarget;
                }
                int length = title.text().length();
                if (length > 5 && length < 40) {
                    return title;
                }
            }
        }

        // Strategy 4: the <title> element itself.
        if (titleEl == null) {
            throw new Exception("title not found");
        }
        return titleEl;
    }

    /**
     * Returns the text node of the body that is most similar to the document
     * title. Despite the method name, similarity is measured with
     * {@link #strSim(String, String)}.
     *
     * @param contentElement not used by this method
     * @return the trimmed text of the best matching text node
     * @throws Exception if no text node has a similarity above zero
     */
    protected String getTitleByEditDistance(Element contentElement) throws Exception {
        final String metaTitle = this.doc.title();
        final ArrayList<Double> best = new ArrayList<Double>();
        best.add(0.0);
        final StringBuilder winner = new StringBuilder();
        NodeVisitor visitor = new NodeVisitor(){

            public void head(Node node, int depth) {
                if (!(node instanceof TextNode)) {
                    return;
                }
                String candidate = ((TextNode)node).text().trim();
                double sim = NewsExtractor.this.strSim(candidate, metaTitle);
                if (sim > 0.0 && sim > best.get(0)) {
                    best.set(0, sim);
                    winner.setLength(0);
                    winner.append(candidate);
                }
            }

            public void tail(Node node, int depth) {
            }
        };
        this.doc.body().traverse(visitor);
        if (winner.length() == 0) {
            throw new Exception();
        }
        return winner.toString();
    }

    /**
     * Computes the length of the longest common subsequence of two strings
     * with the classic dynamic-programming table.
     *
     * @param x first string
     * @param y second string
     * @return the LCS length; {@code 0} when either string is empty
     */
    protected int lcs(String x, String y) {
        int rows = x.length();
        int cols = y.length();
        if (rows == 0 || cols == 0) {
            return 0;
        }
        // table[i][j] = LCS length of the first i chars of x and the first j chars of y.
        int[][] table = new int[rows + 1][cols + 1];
        for (int i = 1; i <= rows; ++i) {
            char cx = x.charAt(i - 1);
            for (int j = 1; j <= cols; ++j) {
                if (cx == y.charAt(j - 1)) {
                    table[i][j] = table[i - 1][j - 1] + 1;
                } else {
                    table[i][j] = Math.max(table[i - 1][j], table[i][j - 1]);
                }
            }
        }
        return table[rows][cols];
    }

    /**
     * Computes the Levenshtein distance between two strings, where insertion,
     * deletion and replacement each cost one.
     *
     * @param word1 first string
     * @param word2 second string
     * @return the minimum number of single-character edits
     */
    protected int editDistance(String word1, String word2) {
        int rows = word1.length();
        int cols = word2.length();
        int[][] cost = new int[rows + 1][cols + 1];
        for (int r = 0; r <= rows; ++r) {
            cost[r][0] = r;
        }
        for (int c = 0; c <= cols; ++c) {
            cost[0][c] = c;
        }
        for (int r = 1; r <= rows; ++r) {
            char left = word1.charAt(r - 1);
            for (int c = 1; c <= cols; ++c) {
                if (left == word2.charAt(c - 1)) {
                    cost[r][c] = cost[r - 1][c - 1];
                } else {
                    int cheapestNeighbour = Math.min(cost[r - 1][c - 1], Math.min(cost[r - 1][c], cost[r][c - 1]));
                    cost[r][c] = cheapestNeighbour + 1;
                }
            }
        }
        return cost[rows][cols];
    }

    /**
     * Extracts the content element of an already parsed document.
     *
     * @param doc the parsed page
     * @return the element judged to hold the article body
     * @throws Exception if extraction fails
     */
    public static Element getContentElementByDoc(Document doc) throws Exception {
        return new NewsExtractor(doc).getContentElement();
    }

    /**
     * Parses the HTML and extracts its content element.
     *
     * @param html the page source
     * @return the element judged to hold the article body
     * @throws Exception if extraction fails
     */
    public static Element getContentElementByHtml(String html) throws Exception {
        return NewsExtractor.getContentElementByDoc(Jsoup.parse(html));
    }

    /**
     * Parses the HTML with the given base URL and extracts its content element.
     *
     * @param html the page source
     * @param url  the base URI passed to the parser
     * @return the element judged to hold the article body
     * @throws Exception if extraction fails
     */
    public static Element getContentElementByHtml(String html, String url) throws Exception {
        return NewsExtractor.getContentElementByDoc(Jsoup.parse(html, url));
    }

    /**
     * Extracts the article text of an already parsed document.
     *
     * @param doc the parsed page
     * @return the text of the content element
     * @throws Exception if extraction fails
     */
    public static String getContentByDoc(Document doc) throws Exception {
        Element content = new NewsExtractor(doc).getContentElement();
        return content.text();
    }

    /**
     * Parses the HTML and extracts the article text.
     *
     * @param html the page source
     * @return the text of the content element
     * @throws Exception if extraction fails
     */
    public static String getContentByHtml(String html) throws Exception {
        Element content = NewsExtractor.getContentElementByDoc(Jsoup.parse(html));
        return content.text();
    }

    /**
     * Parses the HTML with the given base URL and extracts the article text.
     *
     * @param html the page source
     * @param url  the base URI passed to the parser
     * @return the text of the content element
     * @throws Exception if extraction fails
     */
    public static String getContentByHtml(String html, String url) throws Exception {
        Element content = NewsExtractor.getContentElementByDoc(Jsoup.parse(html, url));
        return content.text();
    }

    /**
     * Extracts a full news record from an already parsed document.
     *
     * @param doc the parsed page
     * @return the extracted news
     * @throws Exception if content extraction fails
     */
    public static News getNewsByDoc(Document doc) throws Exception {
        return new NewsExtractor(doc).getNews();
    }

    /**
     * Parses the HTML and extracts a full news record.
     *
     * @param html the page source
     * @return the extracted news
     * @throws Exception if content extraction fails
     */
    public static News getNewsByHtml(String html) throws Exception {
        return NewsExtractor.getNewsByDoc(Jsoup.parse(html));
    }

    /**
     * Parses the HTML with the given base URL and extracts a full news record.
     *
     * @param html the page source
     * @param url  the base URI passed to the parser
     * @return the extracted news
     * @throws Exception if content extraction fails
     */
    public static News getNewsByHtml(String html, String url) throws Exception {
        return NewsExtractor.getNewsByDoc(Jsoup.parse(html, url));
    }

    /**
     * Downloads the page at {@code url} and extracts a news record from it.
     *
     * <p>The page is fetched with {@link HttpClientDownloader}; if that throws,
     * the failure is logged and the page is fetched again through
     * {@code HttpUtils.executeGet}.
     *
     * @param url the address of the article
     * @return the extracted news
     * @throws Exception if no HTML could be obtained or content extraction fails
     */
    public static News getNewsByUrl(String url) throws Exception {
        Html html;
        try {
            html = new HttpClientDownloader().download(url);
        }
        catch (Exception e) {
            LOG.warn("download of {} failed, retrying with HttpUtils", url, e);
            HttpUtils.HttpRequestResult httpRequestResult = HttpUtils.executeGet(url, null, "utf-8");
            html = new Html(httpRequestResult.getMsg());
        }
        if (html == null) {
            // Report the real problem instead of failing with a NullPointerException below.
            throw new Exception("no HTML could be downloaded from " + url);
        }
        return NewsExtractor.getNewsByHtml(html.toString(), url);
    }

    /**
     * Manual check of the date/time patterns against a fixed sample string.
     * Prints the first match and its normalised {@code yyyy-MM-dd HH:mm:ss}
     * form; a parsed year of 1970 (no year in the text) is replaced by the
     * current year.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String sample = "\u53d1\u5e03\u65f6\u95f4\uff1a03-05 20:52";
        String[] regex = new String[]{"([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})", "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2][0-9])[^0-9]{1,5}?([0-9]{1,2})", "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-3][0-9])", "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})", "([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-3][0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})", "([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-3][0-9])[^0-9]{1,5}?([0-9]{1,2})", "([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"};
        String[] timeStr = new String[]{"yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd HH:mm", "yyyy-MM-dd HH", "yyyy-MM-dd", "MM-dd HH:mm:ss", "MM-dd HH:mm", "MM-dd"};

        int idx = 0;
        while (idx < regex.length) {
            Matcher matcher = Pattern.compile(regex[idx]).matcher(sample);
            if (!matcher.find()) {
                ++idx;
                continue;
            }
            String group = matcher.group();
            System.out.println(group);
            SimpleDateFormat parser = new SimpleDateFormat(timeStr[idx]);
            try {
                Date parsed = parser.parse(group);
                Calendar calendar = Calendar.getInstance();
                calendar.setTime(parsed);
                if (calendar.get(Calendar.YEAR) == 1970) {
                    // The sample carried no year; borrow the current one.
                    Calendar now = Calendar.getInstance();
                    now.setTime(new Date(System.currentTimeMillis()));
                    calendar.set(Calendar.YEAR, now.get(Calendar.YEAR));
                }
                SimpleDateFormat printer = new SimpleDateFormat(timeStr[0]);
                System.out.println(printer.format(calendar.getTime()));
            }
            catch (ParseException e) {
                e.printStackTrace();
            }
            // Only the first matching pattern is reported.
            return;
        }
    }

    /**
     * Manual smoke test: downloads two sample articles, each on its own
     * thread, and prints the extracted fields together with their XPaths.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; failures inside the
     *                   worker threads are printed, not propagated
     */
    public static void man(String[] args) throws Exception {
        String[] urls = new String[]{"https://3w.huanqiu.com/a/c36dc8/9CaKrnKpRiU?agt=8", "https://baijiahao.baidu.com/s?id=1660860287770658064"};
        for (int k = 0; k < urls.length; ++k) {
            final String url = urls[k];
            Runnable task = () -> {
                try {
                    News news = NewsExtractor.getNewsByUrl(url);
                    System.out.println(news.getUrl());
                    System.out.println(news.getTitleXPath() + "-----" + news.getTitle());
                    System.out.println(news.getTimeXPath() + "-----" + news.getTime());
                    System.out.println(news.getContentXPath() + "-----" + news.getContent());
                    System.out.println(news.getSourceXPath() + "-----" + news.getSource());
                }
                catch (Exception e) {
                    e.printStackTrace();
                    System.out.println("\u5f02\u5e38\uff01");
                }
            };
            new Thread(task).start();
        }
    }

    /**
     * Descends from {@code element} as long as some direct child still carries
     * almost all of the current element's text (similarity above 0.9), and
     * returns the deepest element reached.
     *
     * @param element the starting element
     * @return the smallest descendant with essentially the same text, or
     *         {@code element} itself
     */
    private Element getMinElement(Element element) {
        Element current = element;
        while (true) {
            Element next = null;
            Elements kids = current.children();
            if (kids != null) {
                for (Element kid : kids) {
                    if (this.strSim(kid.text(), current.text()) > 0.9) {
                        next = kid;
                        break;
                    }
                }
            }
            if (next == null) {
                return current;
            }
            current = next;
        }
    }

    /**
     * Builds an XPath expression for the element.
     *
     * <p>An id yields {@code //tag[@id="..."]}. A class name that is not
     * shared by several elements in the document yields
     * {@code //tag[@class="..."]}. Otherwise the path is built recursively
     * from the parent's path plus this element's step, ending at the document
     * root, which contributes an empty prefix.
     *
     * @param element the element to address
     * @return the XPath, or {@code null} for a {@code title} element
     */
    private String getXpath(Element element) {
        String tagName = element.tagName();
        if (tagName.equals("title")) {
            return null;
        }
        String id = element.id();
        if (StringUtils.isNotBlank(id)) {
            return "//" + tagName + "[@id=\"" + id + "\"]";
        }
        String className = element.className();
        Elements elementsByClass = null;
        if (StringUtils.isNotBlank(className)) {
            elementsByClass = this.doc.getElementsByClass(className);
        }
        if (elementsByClass != null && elementsByClass.size() <= 1) {
            return "//" + tagName + "[@class=\"" + className + "\"]";
        }
        Element parent = element.parent();
        if (parent == null) {
            // Reached the document root: stop here instead of dereferencing a null parent.
            return "";
        }
        String xpathIndex = this.getXpathIndex(parent, element);
        return this.getXpath(parent) + "/" + tagName + xpathIndex;
    }

    /**
     * Builds the predicate that distinguishes {@code target} from its siblings.
     *
     * <p>Preference order: an id predicate; a class predicate when no other
     * sibling with the same tag has the same class; a 1-based position among
     * the siblings with the same tag (as XPath counts them) when there is
     * more than one; otherwise no predicate.
     *
     * @param parent the parent of {@code target}; may be {@code null}
     * @param target the element to address
     * @return the predicate including brackets, or an empty string
     */
    private String getXpathIndex(Element parent, Element target) {
        String id = target.id();
        if (StringUtils.isNotBlank(id)) {
            return "[@id='" + id + "']";
        }
        if (parent == null) {
            return "";
        }
        String tagName = target.tagName();
        String className = target.className();
        boolean hasClass = StringUtils.isNotBlank(className);

        int sameTag = 0;
        int sameTagAndClass = 0;
        int position = 0;
        for (Element sibling : parent.children()) {
            if (!tagName.equals(sibling.tagName())) {
                continue;
            }
            ++sameTag;
            if (hasClass && className.equals(sibling.className())) {
                ++sameTagAndClass;
            }
            if (sibling == target) {
                // XPath's tag[n] counts only siblings with the same tag name.
                position = sameTag;
            }
        }
        if (position == 0) {
            // target is not a child of parent; no meaningful predicate exists.
            return "";
        }
        if (sameTagAndClass == 1) {
            return "[@class='" + className + "']";
        }
        if (sameTag > 1) {
            return "[" + position + "]";
        }
        return "";
    }

    /**
     * Counts all descendant elements (children, grandchildren, ...) of the
     * given element.
     *
     * @param element the root of the subtree; it is not counted itself
     * @return the number of descendant elements
     */
    protected Integer getChildNum(Element element) {
        Elements kids = element.children();
        if (kids == null) {
            return 0;
        }
        int total = 0;
        for (Element kid : kids) {
            // One for the child itself plus everything below it.
            total += 1 + this.getChildNum(kid);
        }
        return total;
    }

    /**
     * Per-element statistics gathered by {@code computeInfo} and consumed by
     * {@code computeScore}.
     */
    class CountInfo {
        /** Number of text characters in the subtree. */
        int textCount = 0;
        /** Number of those characters that sit inside anchors. */
        int linkTextCount = 0;
        /** Number of non-leaf elements in the subtree, including this one. */
        int tagCount = 0;
        /** Number of non-leaf anchor elements in the subtree. */
        int linkTagCount = 0;
        /** Non-link characters per non-link tag for this element. */
        double density = 0.0;
        /** Sum of the densities of the direct children. */
        double densitySum = 0.0;
        /** Not written by the code visible in this class. */
        double score = 0.0;
        /** Number of non-leaf {@code p} elements in the subtree. */
        int pCount = 0;
        /** Text lengths of all leaf elements in the subtree. */
        ArrayList<Integer> leafList = new ArrayList<>();

        CountInfo() {
        }
    }
}

