@@ -2,6 +2,7 @@ const axios = require("axios");
22const fs = require ( "fs" ) ;
33const path = require ( "path" ) ;
44const cheerio = require ( "cheerio" ) ;
5+ const puppeteer = require ( 'puppeteer' ) ;
56
67function getAccessToken ( ) {
78 return process . env . BAIDU_TONGJI_ACCESS_TOKEN || "" ;
@@ -153,44 +154,71 @@ function detectLanguageFromHtml($) {
153154
154155// Fetch the page's title, language, and breadcrumb trail
155156async function getPageTitleAndLang ( url ) {
157+ let browser ;
158+
156159 try {
157- const response = await axios . get ( url , {
158- timeout : 10000 ,
159- headers : {
160- 'User-Agent' : 'Mozilla/5.0 (compatible; Docusaurus-Doc-Crawler/1.0; +https://github.com/radxa-docs/docs)' ,
161- 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' ,
162- 'Accept-Language' : 'en-US,en;q=0.5' ,
163- } ,
160+ browser = await puppeteer . launch ( {
161+ headless : true ,
162+ args : [
163+ '--no-sandbox' ,
164+ '--disable-setuid-sandbox' ,
165+ '--disable-web-security' ,
166+ '--disable-features=IsolateOrigins,site-per-process' ,
167+ ] ,
168+ } ) ;
169+
170+ const page = await browser . newPage ( ) ;
171+
172+ await page . setUserAgent ( 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' ) ;
173+
174+ await page . goto ( url , {
175+ waitUntil : 'networkidle2' ,
176+ timeout : 30000 ,
164177 } ) ;
165178
166- const $ = cheerio . load ( response . data ) ;
167-
179+ // 获取页面HTML
180+ const html = await page . content ( ) ;
181+
182+ const $ = cheerio . load ( html ) ;
183+ let title = "" ;
184+ const titleElement = $ ( "title" ) ;
185+ if ( titleElement . length > 0 ) {
186+ title = titleElement . text ( ) . trim ( ) ;
187+ } else {
188+ const h1Element = $ ( "h1" ) ;
189+ if ( h1Element . length > 0 ) {
190+ title = h1Element . first ( ) . text ( ) . trim ( ) ;
191+ } else {
192+ title = url . split ( "/" ) . pop ( ) || url ;
193+ }
194+ }
195+
168196 let breadcrumbs = [ ] ;
169197 const breadcrumbLinks = $ ( ".breadcrumbs__link" ) ;
170198
171199 if ( breadcrumbLinks . length > 0 ) {
172200 breadcrumbLinks . each ( ( index , element ) => {
173201 const $link = $ ( element ) ;
174202
175- let title = "" ;
203+ let _title = "" ;
176204
177205 const directSpan = $link . children ( "span" ) . first ( ) ;
178206 if ( directSpan . length > 0 ) {
179- title = directSpan . text ( ) . trim ( ) ;
207+ _title = directSpan . text ( ) . trim ( ) ;
180208 } else {
181209 const nestedSpan = $link . find ( "span" ) . first ( ) ;
182210 if ( nestedSpan . length > 0 ) {
183- title = nestedSpan . text ( ) . trim ( ) ;
211+ _title = nestedSpan . text ( ) . trim ( ) ;
184212 } else {
185- title = $link . text ( ) . trim ( ) ;
213+ _title = $link . text ( ) . trim ( ) ;
186214 }
187215 }
188216
189217 let url = $link . attr ( "href" ) || "" ;
190218 url = url . trim ( ) ;
191219
192220 breadcrumbs . push ( {
193- title : title ,
221+ title : _title ,
194222 url : url
195223 } ) ;
196224 } ) ;
@@ -211,9 +239,9 @@ async function getPageTitleAndLang(url) {
211239 }
212240 }
213241
214- return { lang, breadcrumbs } ;
242+ return { lang, breadcrumbs, title } ;
215243 } catch ( error ) {
216- console . error ( `获取页面标题失败 ${ url } :` , error . message ) ;
244+ console . error ( `获取页面数据失败 ${ url } :` , error . message ) ;
217245 let lang = detectLanguageFromUrl ( url ) || "en" ;
218246 return {
219247 title : url . split ( "/" ) . pop ( ) || url ,
@@ -249,8 +277,8 @@ async function updateDataWithTitles(data) {
249277 for ( let i = 0 ; i < urls . length ; i ++ ) {
250278 const url = urls [ i ] ;
251279 console . log ( `正在获取第 ${ i + 1 } /${ urls . length } 个URL的信息: ${ url } ` ) ;
252- const { lang, breadcrumbs } = await getPageTitleAndLang ( url ) ;
253- urlInfo [ url ] = { lang, breadcrumbs } ;
280+ const { lang, breadcrumbs, title } = await getPageTitleAndLang ( url ) ;
281+ urlInfo [ url ] = { lang, breadcrumbs, title } ;
254282
255283 await new Promise ( ( resolve ) => setTimeout ( resolve , 500 ) ) ;
256284 }
@@ -261,6 +289,7 @@ async function updateDataWithTitles(data) {
261289 if ( item . name && urlInfo [ item . name ] ) {
262290 item . breadcrumbs = urlInfo [ item . name ] . breadcrumbs ;
263291 item . lang = urlInfo [ item . name ] . lang ;
292+ item . title = urlInfo [ item . name ] . title ;
264293 }
265294 }
266295 }
0 commit comments