crawler

eMUQI · eMUQI · commit 58125214f3ef · 2019-10-31T14:02:45.000+08:00
diff --git a/crawler/1_getsource.py b/crawler/1_getsource.py
@@ -0,0 +1,12 @@
+"""Download a sample page and print its HTML source."""
+from urllib.request import urlopen
+
+URL = "https://morvanzhou.github.io/static/scraping/basic-structure.html"
+
+# urlopen() returns an HTTP response that holds a socket; the `with` block
+# closes it as soon as the body has been read.
+with urlopen(URL) as response:
+    # read() yields bytes; decode as UTF-8 so non-ASCII text (e.g. Chinese)
+    # prints correctly.
+    html = response.read().decode('utf-8')
+print(html)
diff --git a/crawler/2_match.py b/crawler/2_match.py
@@ -0,0 +1,26 @@
+"""Extract the title, first paragraph and links of a page with regexes."""
+from urllib.request import urlopen
+import re
+
+URL = "https://morvanzhou.github.io/static/scraping/basic-structure.html"
+
+# Close the HTTP response once the body is read; decode the bytes as UTF-8
+# so non-ASCII text (e.g. Chinese) is handled correctly.
+with urlopen(URL) as response:
+    html = response.read().decode('utf-8')
+print(html)
+
+print("-"*100)
+
+# findall() returns an empty list when nothing matches, so guard the [0]
+# lookups instead of letting them raise IndexError.
+titles = re.findall(r"<title>(.+?)</title>", html)
+print("\nPage title is: ", titles[0] if titles else None)
+
+# re.DOTALL lets "." match newlines, so a paragraph spanning several lines
+# is still captured.
+paragraphs = re.findall(r"<p>(.*?)</p>", html, flags=re.DOTALL)
+print("\nPage paragraph is: ", paragraphs[0] if paragraphs else None)
+
+links = re.findall(r'href="(.*?)"', html)
+print("\nAll links: ", links)
diff --git a/crawler/3_bs4test.py b/crawler/3_bs4test.py
@@ -0,0 +1,27 @@
+"""Print the heading, first paragraph and links of a page using bs4."""
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+
+# Third-party requirements:
+#   pip3 install beautifulsoup4
+#   pip3 install lxml
+
+URL = "https://morvanzhou.github.io/static/scraping/basic-structure.html"
+
+# Close the HTTP response once the body is read; decode the bytes as UTF-8
+# so non-ASCII text (e.g. Chinese) is handled correctly.
+with urlopen(URL) as response:
+    html = response.read().decode('utf-8')
+print(html)
+print("-"*100)
+
+soup = BeautifulSoup(html, features='lxml')
+# soup.<tag> is bs4 shorthand for the first tag of that name in the document.
+print(soup.h1)
+print('\n', soup.p)
+
+# href=True keeps only anchors that carry an href attribute, so the
+# subscript below cannot raise KeyError on a bare <a> tag.
+anchors = soup.find_all('a', href=True)
+all_href = [anchor['href'] for anchor in anchors]
+print('\n', all_href)