4
4
5
5
namespace App \YouTube ;
6
6
7
- use Symfony \Component \DomCrawler \Crawler ;
7
+ use MrMySQL \YoutubeTranscript \TranscriptListFetcher ;
8
+ use Symfony \Component \HttpClient \Psr18Client ;
8
9
use Symfony \Contracts \HttpClient \HttpClientInterface ;
9
10
10
11
final class TranscriptFetcher
@@ -16,41 +17,14 @@ public function __construct(
16
17
17
18
/**
 * Fetches the transcript of a YouTube video as a single plain-text string.
 *
 * The injected HTTP client is wrapped in a PSR-18 adapter and handed to the
 * transcript library, which lists the transcripts available for the video.
 * A transcript is then selected from all language codes the list reports,
 * and the "text" entry of every caption item is joined with PHP_EOL.
 *
 * @param string $videoId Video identifier, forwarded unchanged to TranscriptListFetcher::fetch()
 *
 * @return string Caption texts separated by PHP_EOL, with no leading or trailing separator;
 *                an empty string when the fetched transcript contains no items
 */
public function fetchTranscript(string $videoId): string
{
    // The same adapter instance fills all three constructor slots of the fetcher
    // (presumably HTTP client, request factory and stream factory - confirm against the library).
    $psr18Client = new Psr18Client($this->client);
    $fetcher = new TranscriptListFetcher($psr18Client, $psr18Client, $psr18Client);

    $list = $fetcher->fetch($videoId);

    // Every available language code is offered, so no particular language is requested here;
    // which one wins is decided by findTranscript() (NOTE(review): presumably the first match - verify).
    $transcript = $list->findTranscript($list->getAvailableLanguageCodes());

    // implode() puts the separator only BETWEEN items. The previous array_reduce() seeded with ''
    // and prepended PHP_EOL to every item, so the result always began with an empty line.
    return implode(\PHP_EOL, array_column($transcript->fetch(), 'text'));
}
56
30
}
0 commit comments