From a2e7fbdc19e594da2fafb827e5eeae6d522937ae Mon Sep 17 00:00:00 2001 From: sundyli <543950155@qq.com> Date: Sat, 22 Oct 2016 10:48:57 +0800 Subject: [PATCH] Update ContentExtractor : prebuild and update the time regexp pattern to make ContentExtractor faster and more accurate \nUpdate .gitignore : exclude unused folders and files in git --- .gitignore | 12 +- WebCollector/.idea/.name | 1 - WebCollector/.idea/compiler.xml | 32 - .../.idea/copyright/profiles_settings.xml | 3 - WebCollector/.idea/encodings.xml | 7 - WebCollector/.idea/misc.xml | 64 - WebCollector/.idea/workspace.xml | 1286 ----------------- WebCollector/WebCollector.iml | 63 - .../contentextractor/ContentExtractor.java | 36 +- 9 files changed, 22 insertions(+), 1482 deletions(-) delete mode 100644 WebCollector/.idea/.name delete mode 100644 WebCollector/.idea/compiler.xml delete mode 100644 WebCollector/.idea/copyright/profiles_settings.xml delete mode 100644 WebCollector/.idea/encodings.xml delete mode 100644 WebCollector/.idea/misc.xml delete mode 100644 WebCollector/.idea/workspace.xml delete mode 100644 WebCollector/WebCollector.iml diff --git a/.gitignore b/.gitignore index 7a88658e..19d93d91 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,4 @@ -/WebCollector/target/ -/WebCollector/.idea -/WebCollectorExample/ -/WebCollectorExample/target/ -/Lazy/target/ -/Lazy/.idea -/WebCollector-Hadoop/target/ -/JSRule/target/ \ No newline at end of file +WebCollectorExample +target +.idea +*.iml diff --git a/WebCollector/.idea/.name b/WebCollector/.idea/.name deleted file mode 100644 index 41e5d474..00000000 --- a/WebCollector/.idea/.name +++ /dev/null @@ -1 +0,0 @@ -WebCollector \ No newline at end of file diff --git a/WebCollector/.idea/compiler.xml b/WebCollector/.idea/compiler.xml deleted file mode 100644 index a61dad69..00000000 --- a/WebCollector/.idea/compiler.xml +++ /dev/null @@ -1,32 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/WebCollector/.idea/copyright/profiles_settings.xml b/WebCollector/.idea/copyright/profiles_settings.xml deleted file mode 100644 index c7d1c5a8..00000000 --- a/WebCollector/.idea/copyright/profiles_settings.xml +++ /dev/null @@ -1,3 +0,0 @@ - - - \ No newline at end of file diff --git a/WebCollector/.idea/encodings.xml b/WebCollector/.idea/encodings.xml deleted file mode 100644 index fe7d837c..00000000 --- a/WebCollector/.idea/encodings.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/WebCollector/.idea/misc.xml b/WebCollector/.idea/misc.xml deleted file mode 100644 index b31cbbe6..00000000 --- a/WebCollector/.idea/misc.xml +++ /dev/null @@ -1,64 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/WebCollector/.idea/workspace.xml b/WebCollector/.idea/workspace.xml deleted file mode 100644 index 118957a8..00000000 --- a/WebCollector/.idea/workspace.xml +++ /dev/null @@ -1,1286 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1450677009297 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/WebCollector/WebCollector.iml b/WebCollector/WebCollector.iml deleted file mode 100644 index 806026a9..00000000 --- a/WebCollector/WebCollector.iml +++ /dev/null @@ -1,63 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/WebCollector/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java b/WebCollector/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java index e697decc..c1b5995a 100644 --- a/WebCollector/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java +++ b/WebCollector/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java @@ -17,13 +17,6 @@ */ package cn.edu.hfut.dmic.contentextractor; -import cn.edu.hfut.dmic.webcollector.net.HttpRequest; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -34,6 +27,15 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import cn.edu.hfut.dmic.webcollector.net.HttpRequest; + /** * ContentExtractor could extract content,title,time from news webpage * @@ -42,6 +44,8 @@ public class ContentExtractor { public static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class); + private static final Pattern TIME_PATTERN = Pattern.compile("[^0-9]+([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-3][0-9])[^0-9]{1,5}([0-2][0-9]):([0-5][0-9])"); + private static final Pattern DATE_PATTERN = Pattern.compile("[^0-9]+([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-3][0-9])"); protected Document doc; @@ -197,8 +201,6 @@ public News getNews() throws Exception { } protected String getTime(Element contentElement) throws Exception { - String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})"; - Pattern pattern = Pattern.compile(regex); Element current = contentElement; for (int i = 0; i < 2; i++) { if (current != null && current != doc.body()) { @@ -213,8 +215,8 @@ protected String getTime(Element contentElement) throws Exception { break; } String currentHtml = current.outerHtml(); - Matcher matcher = pattern.matcher(currentHtml); - if (matcher.find()) { + Matcher matcher = TIME_PATTERN.matcher(currentHtml); + if (matcher.find() && matcher.groupCount() >= 6) { return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6); } if (current != doc.body()) { @@ -231,8 +233,6 @@ protected String getTime(Element contentElement) throws Exception { } protected String getDate(Element contentElement) throws Exception { - String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})"; - Pattern pattern = Pattern.compile(regex); Element current = contentElement; for (int i = 0; i < 2; i++) { if (current != null && current != doc.body()) { @@ -247,8 +247,8 @@ protected String getDate(Element contentElement) throws Exception { break; } String currentHtml = current.outerHtml(); - Matcher matcher = pattern.matcher(currentHtml); - if (matcher.find()) { + Matcher matcher = DATE_PATTERN.matcher(currentHtml); + if (matcher.find() && matcher.groupCount() >= 3) { return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3); } if (current != doc.body()) { @@ -507,14 +507,14 @@ public static News getNewsByUrl(String url) throws Exception { } public static void main(String[] args) throws Exception { - - News news = ContentExtractor.getNewsByUrl("http://www.huxiu.com/article/121959/1.html"); + String url = "https://www.huxiu.com/article/167883.html"; + News news = ContentExtractor.getNewsByUrl(url); System.out.println(news.getUrl()); System.out.println(news.getTitle()); System.out.println(news.getTime()); System.out.println(news.getContent()); //System.out.println(news.getContentElement()); - + System.out.println(ContentExtractor.getNewsByUrl("http://www.huxiu.com/article/121959/1.html").getTime()); //System.out.println(news); }