# tonghuashun_crawler.py
# Crawls news pages from news.10jqka.com.cn with Playwright and indexes the
# scraped text in a Chroma vector store using Cohere embeddings.
import asyncio
from langchain_cohere import CohereEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from playwright.async_api import async_playwright
from datetime import datetime
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
async def run():
    """Scrape the 10jqka real-time news page into a single document.

    Opens https://news.10jqka.com.cn/realtimenews.html in a Chromium browser
    launched through Playwright, clicks the "load more" button up to 20
    times, then collects the headline text, time and link of every
    ``li`` whose class starts with ``stock_`` under ``ul.newsText.all``.

    The page only exposes a time of day, so today's date is prepended to it.

    Returns:
        A one-element list holding a ``Document`` whose ``page_content`` is
        one line per news item (an empty string when nothing was found).
    """
    url = "https://news.10jqka.com.cn/realtimenews.html"
    max_attempts = 5
    load_more_clicks = 20

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()

            # Navigate with retries; every attempt is inside the try so a
            # transient failure on the first navigation is retried as well.
            for attempt in range(1, max_attempts + 1):
                try:
                    await page.goto(url)
                    # Wait until the news list is present in the DOM.
                    await page.wait_for_selector("ul.newsText.all")
                    break
                except Exception as exc:
                    # Best effort: report the failure instead of hiding it.
                    print(f"页面加载失败(第{attempt}/{max_attempts}次): {exc}")

            # Click the "load more" button repeatedly to pull in older items.
            for _ in range(load_more_clicks):
                try:
                    # Wait for the "load more" button to be available.
                    await page.wait_for_selector("div.downLoadMore")
                    await page.click("div.downLoadMore")
                    # Give the newly requested items 4 seconds to render.
                    await page.wait_for_timeout(4000)
                except Exception as e:
                    print(f"加载更多内容时出错: {e}")
                    break

            # Every news entry is an <li> whose class starts with "stock_".
            elements = await page.query_selector_all(
                "ul.newsText.all li[class^='stock_']"
            )

            # Loop-invariant: the date prefix is the same for every item.
            current_date = datetime.now().strftime("%Y-%m-%d")
            lines = []
            for element in elements:
                # Time of day shown next to the item, if any.
                time_element = await element.query_selector("div.newsTimer")
                if time_element:
                    time_text = await time_element.inner_text()
                    combined_time = f"{current_date}-{time_text}"
                else:
                    combined_time = "N/A"

                # Headline text and link come from the item's first anchor.
                a_element = await element.query_selector("a")
                if a_element:
                    # get_attribute yields None when the anchor has no href;
                    # fall back so the string assembly below cannot fail.
                    href = await a_element.get_attribute("href") or "N/A"
                    content = await a_element.inner_text()
                else:
                    href = "N/A"
                    content = "N/A"

                lines.append(
                    content + '发布时间为:' + combined_time
                    + '。网页源链接:' + href + '\n'
                )
        finally:
            # Release the browser even when scraping raised.
            await browser.close()

    return [Document(page_content="".join(lines))]
# def convert_to_documents(news_data):
# documents = []
# for i, item in enumerate(news_data):
# page_content = item["内容"]+'网页源链接:'+item["网页链接"]+'。发布时间:'+item["发布时间"]
# documents.append(Document(page_content=page_content))
# return documents
def retrieve_text(docs):
    """Split *docs* into chunks, embed them and persist them in Chroma.

    The documents are cut into 1000-character chunks with 200 characters of
    overlap, embedded with the Cohere ``embed-multilingual-v3.0`` model and
    written to a Chroma collection. Collection name and on-disk directory
    both carry a ``YYYYmmddHHMM`` timestamp of the current time.

    Args:
        docs: Documents to index; handed to ``split_documents``.

    Returns:
        The object returned by ``Chroma.from_documents``, or ``None`` when
        there was nothing to index (no documents, or no non-empty chunks).
    """
    # Nothing to do for an empty input; skip the embedding service entirely.
    if not docs:
        print("没有可索引的文档,跳过向量化。")
        return None

    # Split the documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200
    )
    splits = text_splitter.split_documents(docs)
    if not splits:
        # E.g. a scrape that produced only empty text.
        print("文档切分后没有内容,跳过向量化。")
        return None

    # Embed the chunks and persist them under a timestamped name.
    embeddings = CohereEmbeddings(model="embed-multilingual-v3.0")
    current_date = datetime.now().strftime("%Y%m%d%H%M")
    collection_name = "vectorstore_" + current_date
    db_name = "./chroma_db_" + current_date
    return Chroma.from_documents(
        collection_name=collection_name,
        documents=splits,
        embedding=embeddings,
        persist_directory=db_name,
    )
async def cjyw_run():
    """Open the 10jqka "today_list" news page and wait for its news list.

    Navigates to https://news.10jqka.com.cn/today_list/ in a Chromium
    browser launched through Playwright, retrying up to 5 times until
    ``ul.newsText.all`` appears.

    NOTE(review): this is currently a stub. It loads the page but extracts
    nothing and returns ``None``; content extraction is still to be written.

    Returns:
        None.
    """
    url = "https://news.10jqka.com.cn/today_list/"
    max_attempts = 5

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()

            # Navigate with retries; every attempt is inside the try so a
            # transient failure on the first navigation is retried as well.
            for attempt in range(1, max_attempts + 1):
                try:
                    await page.goto(url)
                    # Wait until the news list is present in the DOM.
                    await page.wait_for_selector("ul.newsText.all")
                    break
                except Exception as exc:
                    # Best effort: report the failure instead of hiding it.
                    print(f"页面加载失败(第{attempt}/{max_attempts}次): {exc}")
        finally:
            # Release the browser even when navigation raised.
            await browser.close()
if __name__ == "__main__":
    # Script entry point. Exactly one crawl mode is active at a time; the
    # other is kept commented out below.

    '''同花顺7*24小时新闻爬取'''
    # Mode 1 (disabled): crawl the 24/7 real-time news feed and index it.
    # documents = asyncio.run(run())
    # print(documents)
    # # docs=convert_to_documents(news_data)
    # retrieve_text(documents)

    '''同花顺分板块爬取'''
    # Mode 2 (active): crawl by section, currently the financial headlines.
    headlines_crawl = cjyw_run()
    cjyw_docs = asyncio.run(headlines_crawl)