@@ -157,27 +157,47 @@ async function getPageTitleAndLang(url) {
157157 const response = await axios . get ( url , {
158158 timeout : 10000 ,
159159 headers : {
160- "User-Agent" :
161- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" ,
160+ 'User-Agent' : 'Mozilla/5.0 (compatible; Docusaurus-Doc-Crawler/1.0; +https://github.com/radxa-docs/docs)' ,
161+ 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' ,
162+ 'Accept-Language' : 'en-US,en;q=0.5' ,
162163 } ,
163164 } ) ;
164165
165166 const $ = cheerio . load ( response . data ) ;
166167
167- // 获取标题
168- let title = "" ;
169- const titleElement = $ ( "title" ) ;
170- if ( titleElement . length > 0 ) {
171- title = titleElement . text ( ) . trim ( ) ;
172- } else {
173- const h1Element = $ ( "h1" ) ;
174- if ( h1Element . length > 0 ) {
175- title = h1Element . first ( ) . text ( ) . trim ( ) ;
176- } else {
177- title = url . split ( "/" ) . pop ( ) || url ;
178- }
168+ let breadcrumbs = [ ] ;
169+ const breadcrumbLinks = $ ( ".breadcrumbs__link" ) ;
170+
171+ if ( breadcrumbLinks . length > 0 ) {
172+ breadcrumbLinks . each ( ( index , element ) => {
173+ const $link = $ ( element ) ;
174+
175+ let title = "" ;
176+
177+ const directSpan = $link . children ( "span" ) . first ( ) ;
178+ if ( directSpan . length > 0 ) {
179+ title = directSpan . text ( ) . trim ( ) ;
180+ } else {
181+ const nestedSpan = $link . find ( "span" ) . first ( ) ;
182+ if ( nestedSpan . length > 0 ) {
183+ title = nestedSpan . text ( ) . trim ( ) ;
184+ } else {
185+ title = $link . text ( ) . trim ( ) ;
186+ }
187+ }
188+
189+ let url = $link . attr ( "href" ) || "" ;
190+ url = url . trim ( ) ;
191+
192+ breadcrumbs . push ( {
193+ title : title ,
194+ url : url
195+ } ) ;
196+ } ) ;
179197 }
180198
199+ breadcrumbs = breadcrumbs . filter ( item => item . title || item . url ) ;
200+
181201 let lang = detectLanguageFromUrl ( url ) ;
182202 if ( ! lang ) {
183203 lang = detectLanguageFromHtml ( $ ) ;
@@ -191,7 +211,7 @@ async function getPageTitleAndLang(url) {
191211 }
192212 }
193213
194- return { title , lang } ;
214+ return { lang , breadcrumbs } ;
195215 } catch ( error ) {
196216 console . error ( `获取页面标题失败 ${ url } :` , error . message ) ;
197217 let lang = detectLanguageFromUrl ( url ) || "en" ;
@@ -229,8 +249,8 @@ async function updateDataWithTitles(data) {
229249 for ( let i = 0 ; i < urls . length ; i ++ ) {
230250 const url = urls [ i ] ;
231251 console . log ( `正在获取第 ${ i + 1 } /${ urls . length } 个URL的信息: ${ url } ` ) ;
232- const { title , lang } = await getPageTitleAndLang ( url ) ;
233- urlInfo [ url ] = { title , lang } ;
252+ const { lang , breadcrumbs } = await getPageTitleAndLang ( url ) ;
253+ urlInfo [ url ] = { lang , breadcrumbs } ;
234254
235255 await new Promise ( ( resolve ) => setTimeout ( resolve , 500 ) ) ;
236256 }
@@ -239,7 +259,7 @@ async function updateDataWithTitles(data) {
239259 for ( const itemGroup of items ) {
240260 for ( const item of itemGroup ) {
241261 if ( item . name && urlInfo [ item . name ] ) {
242- item . title = urlInfo [ item . name ] . title ;
262+ item . breadcrumbs = urlInfo [ item . name ] . breadcrumbs ;
243263 item . lang = urlInfo [ item . name ] . lang ;
244264 }
245265 }
0 commit comments