-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathyoutube.py
193 lines (145 loc) · 6.57 KB
/
youtube.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
from googleapiclient.discovery import build
import json
import pandas as pd
import re
class Youtube:
def __init__(self, api_key):
self.api_key = api_key
def get_youtube_api_service(self):
"""
Builds a service object for interacting with the YouTube Data API.
Returns:
googleapiclient.discovery.Resource:
A service object that allows interaction with the YouTube Data API.
"""
youtube_api_service = build('youtube', 'v3', developerKey=self.api_key)
return youtube_api_service
def get_video_id(self, youtube_url):
"""
Extracts the YouTube video ID from a given YouTube video URL.
Args:
- youtube_url (str): The URL of the YouTube video.
Returns:
str or None:
If a valid video ID is found, returns the extracted video ID.
If no video ID is found, returns None.
"""
# Define the regular expression pattern to match the video ID
pattern = re.compile(r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})')
# Use the pattern to find the video ID in the URL
match = pattern.search(youtube_url)
if match:
video_id = match.group(1)
return video_id
else:
return None
def search_videos(self, api_service,keyword, max_results = 5):
"""
Searches for videos on YouTube using the provided API service and a specified keyword.
Args:
- api_service (googleapiclient.discovery.Resource): The YouTube Data API service object created using the 'build' function.
- keyword (str): The search keyword to look for in YouTube videos.
- max_results (int, optional): The maximum number of video results to retrieve. Default is 10.
Returns:
- List[str]: A list of video IDs corresponding to the search results.
"""
search_response = api_service.search().list(
q = keyword,
part = 'id',
type = 'video',
maxResults = max_results,
eventType = 'completed'
).execute()
video_ids = [item['id']['videoId'] for item in search_response['items']]
return video_ids
def get_video_details(self, api_service, video_id):
"""
Retrieves details of a specific YouTube video using the provided API service and video ID.
Args:
- api_service (googleapiclient.discovery.Resource): The YouTube Data API service object created using the 'build' function.
- video_id (str): The ID of the YouTube video for which details are to be retrieved.
Returns:
dict or None:
If details are found, returns a dictionary containing the video's snippet information.
If no details are found, returns None.
"""
video_response = api_service.videos().list(
part='snippet',
id=video_id
).execute()
if 'items' in video_response and video_response['items']:
return video_response['items'][0]['snippet']
else:
return None
def get_data(self,video_id, max_results=100):
"""
Retrieves user comments and related information from a YouTube video using the YouTube Data API.
Args:
- video_id (str): The YouTube video ID for which comments are to be retrieved.
- max_results (int, optional): The maximum number of comments to fetch. Default is 100.
Returns:
- pandas.DataFrame or Exception:
If successful, returns a DataFrame containing comment details such as text, replies, user names, and dates.
If an error occurs during the API request, returns None.
"""
# Initialize the API request
youtube = build('youtube', 'v3', developerKey=self.api_key)
request = youtube.commentThreads().list(
part="snippet, replies",
videoId=video_id,
textFormat="plainText",
maxResults=max_results
)
# Lists to store comments, replies, user names, and dates
comments = []
replies_list = []
user_names = []
dates = []
# Fetching the comments
while len(comments) < max_results and request:
try:
# Execute the API request
response = request.execute()
# Iterate through the response items
for item in response['items']:
# Extract the comment details
comment_snippet = item['snippet']['topLevelComment']['snippet']
if (len(comment_snippet['textDisplay']) > 128):
continue
comments.append(comment_snippet['textDisplay'])
user_names.append(comment_snippet['authorDisplayName'])
dates.append(comment_snippet['publishedAt'])
# Extract replies if available
reply_count = item['snippet']['totalReplyCount']
if reply_count > 0:
replies = [reply['snippet']['textDisplay'] for reply in item['replies']['comments']]
else:
replies = []
replies_list.append(replies)
# Check if there are more pages of comments
request = youtube.commentThreads().list_next(request, response)
except Exception as e:
return None
# Create a DataFrame for collected data
df = pd.DataFrame({"Comment": comments, "Replies": replies_list, "User Name": user_names, "Date": dates})
return df
def save_data(self, df):
"""
Saves a DataFrame containing YouTube video comments and related information to a CSV file.
Args:
- df (pandas.DataFrame): The DataFrame containing comment details such as text, replies, user names, and dates.
Returns:
None
"""
# Save the DataFrame to a CSV file
df.to_csv("comments.csv", index=False, encoding='utf-8')
def get_comments(self, df):
"""
Extracts and returns a list of comments from a DataFrame containing YouTube video comments.
Args:
- df (pandas.DataFrame): The DataFrame containing comment details such as text, replies, user names, and dates.
Returns:
List[str]: A list of comments extracted from the DataFrame.
"""
comments = df["Comment"].tolist()
return comments