Merge pull request #242 from M786453/master

Youtube Playlist Info Scraper
DhanushNehru · Jun 25, 2024 · b2e9937 · b2e9937
2 parents f84a12d + 3f54aee
commit b2e9937
Show file tree

Hide file tree

Showing 4 changed files with 208 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -122,6 +122,7 @@ More information on contributing and the general code of conduct for discussion
 | Word to PDF                          | [Word to PDF](https://github.com/DhanushNehru/Python-Scripts/tree/master/Word%20to%20PDF%20converter)                                         | A Python script to convert an MS Word file to a PDF file.                                                            |
 | Youtube Downloader                   | [Youtube Downloader](https://github.com/DhanushNehru/Python-Scripts/tree/master/Youtube%20Downloader)                                         | Downloads any video from [YouTube](https://youtube.com) in video or audio format!                               
 | Pigeonhole Sort                      | [Algorithm](https://github.com/DhanushNehru/Python-Scripts/tree/master/PigeonHole)                                                            | the pigeonhole sort algorithm to sort your arrays efficiently!
+| Youtube Playlist Info Scraper                      | [Youtube Playlist Info Scraper](https://github.com/DhanushNehru/Python-Scripts/tree/master/Youtube%20Playlist%20Info%20Scraper)                                                            | This python module retrieve information about a YouTube playlist in json format using playlist link.
 
 ## Gitpod
 

diff --git a/Youtube Playlist Info Scraper/Playlist.py b/Youtube Playlist Info Scraper/Playlist.py
@@ -0,0 +1,160 @@
+"""
+This module provides functionalities for YouTube Playlist.
+"""
+
+import requests
+from bs4 import BeautifulSoup
+import json
+
+class Playlist:
+
+    """
+    This class provides methods to perform operatoins for given YouTube Playlist.
+    """
+
+    def __init__(self, playlist_link):
+
+        """
+        Initializes the playlist with a playlist link.
+
+        Parameters:
+            playlist_link (str): Url of YouTube Playlist
+        """
+
+        self.playlist_link = playlist_link
+
+    def info(self):
+
+        """
+        Returns:
+            dict: Information about given Playlist.
+        """
+
+        info = {}
+
+        try:
+
+            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.112 Safari/537.36"}
+
+            response = requests.get(url=self.playlist_link, headers=headers)
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            script_elements = soup.find_all('script')
+
+            for e in script_elements:
+
+                if e.text.startswith("var ytInitialData"):
+
+                    data_dict = json.loads(e.text[20:-1])
+
+                    playlist = data_dict["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]
+
+                    if "title" in playlist:
+                        info["title"] = playlist["title"]
+                    else:
+                        info["title"] = ""
+
+                    if "totalVideos" in playlist:
+                        info["totalVideos"] = playlist["totalVideos"]
+                    else:
+                        info["totalVideos"] = ""
+
+                    if "ownerName" in playlist:
+                        info["channelName"] = playlist["ownerName"]["simpleText"]
+                    else:
+                        info["channelName"] = ""
+
+                    if "playlistShareUrl" in playlist:
+                        info["playlistUrl"] = playlist["playlistShareUrl"]
+                    else:
+                        info["playlistUrl"] = ""
+
+                    if "contents" in playlist:
+
+                        playlist_videos = playlist["contents"]
+
+                        info["videos"] = []
+
+                        for video in playlist_videos:
+
+                            video_data = {}
+
+                            video = video["playlistPanelVideoRenderer"]
+
+                            if "title" in video:
+                                video_data["title"] = video["title"]["simpleText"]
+                            else:
+                                video_data["title"] = ""
+
+                            if "lengthText" in video:
+                                video_data["duration"] = video["lengthText"]["simpleText"]
+                            else:
+                                video_data["duration"] = ""
+
+                            if "videoId" in video:
+                                video_data["id"] = video["videoId"]
+                            else:
+                                video_data["id"] = ""
+
+                            info["videos"].append(video_data) # Update info with video
+
+                    info["duration"] = self.__calculatePlaylistDuration(info["videos"])
+
+                    break # Target Element Found; Break loop
+
+        except Exception as e:
+            print("Error in info():", e)
+
+        return info
+
+    def __calculatePlaylistDuration(self, videos_data):
+
+        """
+        Calculate total playlist duration by aggregating the duration of all videos present in playlist.
+
+        Parameters:
+            list: List of videos' data
+        
+        Returns:
+            str: Total duration of Playlist Videos in format -> HH:MM:SS
+        """
+
+        total_duration = "00:00:00"
+
+        try:
+
+            hours, minutes, seconds = 0,0,0
+
+            for video in videos_data:
+
+                video_duration = video["duration"]
+
+                video_duration_parts = video_duration.split(":")
+
+                if len(video_duration_parts) == 3:
+                    hours += int(video_duration_parts[0])
+                    minutes += int(video_duration_parts[1])
+                    seconds += int(video_duration_parts[2])
+
+                if len(video_duration_parts) == 2:
+                    minutes += int(video_duration_parts[0])
+                    seconds += int(video_duration_parts[1])
+
+                if len(video_duration_parts) == 1:
+                    seconds += int(video_duration_parts[0])
+
+            hours += minutes // 60
+
+            minutes = minutes % 60
+
+            minutes += seconds // 60
+
+            seconds = seconds % 60
+
+            total_duration = f"{hours}:{minutes}:{seconds}"
+
+        except Exception as e:
+            print("Error in __calculatePlaylistDuration():", e)
+
+        return total_duration
diff --git a/Youtube Playlist Info Scraper/README.md b/Youtube Playlist Info Scraper/README.md
@@ -0,0 +1,45 @@
+## YouTube Playlist Info Scraper
+
+This python module retrieve information about a YouTube playlist in json format using playlist link.
+
+### Usage:
+
+Install dependencies:
+
+    pip install -r requirements.txt
+
+Import module:
+
+    from Playlist import Playlist
+
+Create Object:
+
+    playlist = Playlist("PLAYLIST_LINK_HERE") # Example: https://www.youtube.com/watch?v=_t2GVaQasRY&list=PLeo1K3hjS3uu_n_a__MI_KktGTLYopZ12
+
+Retrieve Playlist Info:
+
+    info = playlist.info()
+    print(info)
+
+### Output Format:
+
+```
+    {
+        "title": ..., 
+        "totalVideos": ..., 
+        "channelName": ..., 
+        "playlistUrl": ..., 
+        "duration": ...,
+        "videos": [
+            {
+                "title": ..., 
+                "duration": ..., 
+                "id": ...
+            }
+            , 
+            .
+            .
+            .
+            ], 
+    }
+```
diff --git a/Youtube Playlist Info Scraper/requirements.txt b/Youtube Playlist Info Scraper/requirements.txt
@@ -0,0 +1,2 @@
+requests
+bs4