Skip to content

Commit d5091ad

Browse files
authored
Merge pull request #1085 from xzuoqi/main
fix: add puppeteer dependency
2 parents 055d6ed + 0f3a34e commit d5091ad

File tree

1 file changed

+47
-18
lines changed

1 file changed

+47
-18
lines changed

.github/scripts/baidu-tongji.js

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ const axios = require("axios");
22
const fs = require("fs");
33
const path = require("path");
44
const cheerio = require("cheerio");
5+
const puppeteer = require('puppeteer');
56

67
function getAccessToken() {
78
return process.env.BAIDU_TONGJI_ACCESS_TOKEN || "";
@@ -153,44 +154,71 @@ function detectLanguageFromHtml($) {
153154

154155
// Fetch a page's title, language and breadcrumb trail
155156
async function getPageTitleAndLang(url) {
157+
let browser;
158+
156159
try {
157-
const response = await axios.get(url, {
158-
timeout: 10000,
159-
headers: {
160-
'User-Agent': 'Mozilla/5.0 (compatible; Docusaurus-Doc-Crawler/1.0; +https://github.com/radxa-docs/docs)',
161-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
162-
'Accept-Language': 'en-US,en;q=0.5',
163-
},
160+
browser = await puppeteer.launch({
161+
headless: true,
162+
args: [
163+
'--no-sandbox',
164+
'--disable-setuid-sandbox',
165+
'--disable-web-security',
166+
'--disable-features=IsolateOrigins,site-per-process',
167+
],
168+
});
169+
170+
const page = await browser.newPage();
171+
172+
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
173+
174+
await page.goto(url, {
175+
waitUntil: 'networkidle2',
176+
timeout: 30000,
164177
});
165178

166-
const $ = cheerio.load(response.data);
167-
179+
// 获取页面HTML
180+
const html = await page.content();
181+
182+
const $ = cheerio.load(html);
183+
let title = "";
184+
const titleElement = $("title");
185+
if (titleElement.length > 0) {
186+
title = titleElement.text().trim();
187+
} else {
188+
const h1Element = $("h1");
189+
if (h1Element.length > 0) {
190+
title = h1Element.first().text().trim();
191+
} else {
192+
title = url.split("/").pop() || url;
193+
}
194+
}
195+
168196
let breadcrumbs = [];
169197
const breadcrumbLinks = $(".breadcrumbs__link");
170198

171199
if (breadcrumbLinks.length > 0) {
172200
breadcrumbLinks.each((index, element) => {
173201
const $link = $(element);
174202

175-
let title = "";
203+
let _title = "";
176204

177205
const directSpan = $link.children("span").first();
178206
if (directSpan.length > 0) {
179-
title = directSpan.text().trim();
207+
_title = directSpan.text().trim();
180208
} else {
181209
const nestedSpan = $link.find("span").first();
182210
if (nestedSpan.length > 0) {
183-
title = nestedSpan.text().trim();
211+
_title = nestedSpan.text().trim();
184212
} else {
185-
title = $link.text().trim();
213+
_title = $link.text().trim();
186214
}
187215
}
188216

189217
let url = $link.attr("href") || "";
190218
url = url.trim();
191219

192220
breadcrumbs.push({
193-
title: title,
221+
title: _title,
194222
url: url
195223
});
196224
});
@@ -211,9 +239,9 @@ async function getPageTitleAndLang(url) {
211239
}
212240
}
213241

214-
return { lang, breadcrumbs };
242+
return { lang, breadcrumbs, title };
215243
} catch (error) {
216-
console.error(`获取页面标题失败 ${url}:`, error.message);
244+
console.error(`获取页面数据失败 ${url}:`, error.message);
217245
let lang = detectLanguageFromUrl(url) || "en";
218246
return {
219247
title: url.split("/").pop() || url,
@@ -249,8 +277,8 @@ async function updateDataWithTitles(data) {
249277
for (let i = 0; i < urls.length; i++) {
250278
const url = urls[i];
251279
console.log(`正在获取第 ${i + 1}/${urls.length} 个URL的信息: ${url}`);
252-
const { lang, breadcrumbs } = await getPageTitleAndLang(url);
253-
urlInfo[url] = { lang, breadcrumbs };
280+
const { lang, breadcrumbs, title } = await getPageTitleAndLang(url);
281+
urlInfo[url] = { lang, breadcrumbs, title };
254282

255283
await new Promise((resolve) => setTimeout(resolve, 500));
256284
}
@@ -261,6 +289,7 @@ async function updateDataWithTitles(data) {
261289
if (item.name && urlInfo[item.name]) {
262290
item.breadcrumbs = urlInfo[item.name].breadcrumbs;
263291
item.lang = urlInfo[item.name].lang;
292+
item.title = urlInfo[item.name].title;
264293
}
265294
}
266295
}

0 commit comments

Comments
 (0)