From a2e7fbdc19e594da2fafb827e5eeae6d522937ae Mon Sep 17 00:00:00 2001
From: sundyli <543950155@qq.com>
Date: Sat, 22 Oct 2016 10:48:57 +0800
Subject: [PATCH] Update ContentExtractor: precompile and update the time regexp
patterns to make ContentExtractor faster and more accurate; update
.gitignore to exclude unused folders and files from git
---
.gitignore | 12 +-
WebCollector/.idea/.name | 1 -
WebCollector/.idea/compiler.xml | 32 -
.../.idea/copyright/profiles_settings.xml | 3 -
WebCollector/.idea/encodings.xml | 7 -
WebCollector/.idea/misc.xml | 64 -
WebCollector/.idea/workspace.xml | 1286 -----------------
WebCollector/WebCollector.iml | 63 -
.../contentextractor/ContentExtractor.java | 36 +-
9 files changed, 22 insertions(+), 1482 deletions(-)
delete mode 100644 WebCollector/.idea/.name
delete mode 100644 WebCollector/.idea/compiler.xml
delete mode 100644 WebCollector/.idea/copyright/profiles_settings.xml
delete mode 100644 WebCollector/.idea/encodings.xml
delete mode 100644 WebCollector/.idea/misc.xml
delete mode 100644 WebCollector/.idea/workspace.xml
delete mode 100644 WebCollector/WebCollector.iml
diff --git a/.gitignore b/.gitignore
index 7a88658e..19d93d91 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,4 @@
-/WebCollector/target/
-/WebCollector/.idea
-/WebCollectorExample/
-/WebCollectorExample/target/
-/Lazy/target/
-/Lazy/.idea
-/WebCollector-Hadoop/target/
-/JSRule/target/
\ No newline at end of file
+WebCollectorExample
+target
+.idea
+*.iml
diff --git a/WebCollector/.idea/.name b/WebCollector/.idea/.name
deleted file mode 100644
index 41e5d474..00000000
--- a/WebCollector/.idea/.name
+++ /dev/null
@@ -1 +0,0 @@
-WebCollector
\ No newline at end of file
diff --git a/WebCollector/.idea/compiler.xml b/WebCollector/.idea/compiler.xml
deleted file mode 100644
index a61dad69..00000000
--- a/WebCollector/.idea/compiler.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/WebCollector/.idea/copyright/profiles_settings.xml b/WebCollector/.idea/copyright/profiles_settings.xml
deleted file mode 100644
index c7d1c5a8..00000000
--- a/WebCollector/.idea/copyright/profiles_settings.xml
+++ /dev/null
@@ -1,3 +0,0 @@
-
-
-
\ No newline at end of file
diff --git a/WebCollector/.idea/encodings.xml b/WebCollector/.idea/encodings.xml
deleted file mode 100644
index fe7d837c..00000000
--- a/WebCollector/.idea/encodings.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/WebCollector/.idea/misc.xml b/WebCollector/.idea/misc.xml
deleted file mode 100644
index b31cbbe6..00000000
--- a/WebCollector/.idea/misc.xml
+++ /dev/null
@@ -1,64 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/WebCollector/.idea/workspace.xml b/WebCollector/.idea/workspace.xml
deleted file mode 100644
index 118957a8..00000000
--- a/WebCollector/.idea/workspace.xml
+++ /dev/null
@@ -1,1286 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 1450677009297
-
- 1450677009297
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/WebCollector/WebCollector.iml b/WebCollector/WebCollector.iml
deleted file mode 100644
index 806026a9..00000000
--- a/WebCollector/WebCollector.iml
+++ /dev/null
@@ -1,63 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/WebCollector/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java b/WebCollector/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java
index e697decc..c1b5995a 100644
--- a/WebCollector/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java
+++ b/WebCollector/src/main/java/cn/edu/hfut/dmic/contentextractor/ContentExtractor.java
@@ -17,13 +17,6 @@
*/
package cn.edu.hfut.dmic.contentextractor;
-import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -34,6 +27,15 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
+
/**
* ContentExtractor could extract content,title,time from news webpage
*
@@ -42,6 +44,8 @@
public class ContentExtractor {
public static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class);
+ private static final Pattern TIME_PATTERN = Pattern.compile("[^0-9]+([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-3][0-9])[^0-9]{1,5}([0-2][0-9]):([0-5][0-9])");
+ private static final Pattern DATE_PATTERN = Pattern.compile("[^0-9]+([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-3][0-9])");
protected Document doc;
@@ -197,8 +201,6 @@ public News getNews() throws Exception {
}
protected String getTime(Element contentElement) throws Exception {
- String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
- Pattern pattern = Pattern.compile(regex);
Element current = contentElement;
for (int i = 0; i < 2; i++) {
if (current != null && current != doc.body()) {
@@ -213,8 +215,8 @@ protected String getTime(Element contentElement) throws Exception {
break;
}
String currentHtml = current.outerHtml();
- Matcher matcher = pattern.matcher(currentHtml);
- if (matcher.find()) {
+ Matcher matcher = TIME_PATTERN.matcher(currentHtml);
+ if (matcher.find() && matcher.groupCount() >= 6) {
return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
}
if (current != doc.body()) {
@@ -231,8 +233,6 @@ protected String getTime(Element contentElement) throws Exception {
}
protected String getDate(Element contentElement) throws Exception {
- String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})";
- Pattern pattern = Pattern.compile(regex);
Element current = contentElement;
for (int i = 0; i < 2; i++) {
if (current != null && current != doc.body()) {
@@ -247,8 +247,8 @@ protected String getDate(Element contentElement) throws Exception {
break;
}
String currentHtml = current.outerHtml();
- Matcher matcher = pattern.matcher(currentHtml);
- if (matcher.find()) {
+ Matcher matcher = DATE_PATTERN.matcher(currentHtml);
+ if (matcher.find() && matcher.groupCount() >= 3) {
return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3);
}
if (current != doc.body()) {
@@ -507,14 +507,14 @@ public static News getNewsByUrl(String url) throws Exception {
}
public static void main(String[] args) throws Exception {
-
- News news = ContentExtractor.getNewsByUrl("http://www.huxiu.com/article/121959/1.html");
+ String url = "https://www.huxiu.com/article/167883.html";
+ News news = ContentExtractor.getNewsByUrl(url);
System.out.println(news.getUrl());
System.out.println(news.getTitle());
System.out.println(news.getTime());
System.out.println(news.getContent());
//System.out.println(news.getContentElement());
-
+ System.out.println(ContentExtractor.getNewsByUrl("http://www.huxiu.com/article/121959/1.html").getTime());
//System.out.println(news);
}