-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
162 lines (127 loc) · 5.57 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import time
import csv
import sys
from random import randint
from source.youtube_scraping_functions import get_video_info
from source.youtube_get_transcription import get_transcription
from source.youtube_scraping_functions import get_youtube_urls
# Creation of youtubevideo class
class youtubevideo:
    """
    Container for the metadata and transcription of a single YouTube video.

    All attributes are stored exactly as supplied by the scraping layer;
    this class does no validation or type coercion of its own.
    """
    def __init__(self, url, title, description, views, published, likes, dislikes, channel_name, channel_url,
                 channel_subscribers, transcription):
        self.url = url
        self.title = title
        self.description = description
        self.views = views
        self.published = published
        self.likes = likes
        self.dislikes = dislikes
        self.channel_name = channel_name
        self.channel_url = channel_url
        self.channel_subscribers = channel_subscribers
        self.transcription = transcription
    def get_like_ratio(self):
        """
        Return the ratio of likes to dislikes as a float.

        Bug fix: the original divided unconditionally, so any video with
        zero dislikes raised ZeroDivisionError. With zero dislikes we now
        return float('inf') when there are likes, and 0.0 when there are
        neither likes nor dislikes.
        """
        if not self.dislikes:
            return float('inf') if self.likes else 0.0
        return self.likes / self.dislikes
# Creates a function to create a file name based off the keyword
def paste_filename(search):
    """
    Build the CSV file name for a given search term.

    Every space in the search is turned into an underscore and the
    suffix '_videos.csv' is appended.
    """
    return f"{search.replace(' ', '_')}_videos.csv"
# MAIN PROGRAM
def search_video_extraction_program():
    """
    Extract video metadata and transcriptions from YouTube for a
    user-supplied search term and append them row-by-row to a CSV file.

    Randomised delays are kept between scraping calls because the rate
    limits of the underlying endpoints are unknown; tune them if that
    turns out not to be an issue.
    """
    # Takes a YouTube search as input
    input_keyword = input("Enter YouTube Search: ")
    # Column headers for the CSV file
    csv_column_names = ['keyword', 'url', 'title', 'description', 'views', 'published', 'likes', 'dislikes',
                        'channel_name', 'channel_url',
                        'channel_subscribers', 'transcription']
    # Creates the output file and writes the header row
    print("Creating New CSV File...")
    with open(paste_filename(input_keyword), 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(csv_column_names)
    print("Getting YouTube URLs based off search...")
    # Gets the list of URLs based off the keyword received
    list_of_urls = get_youtube_urls(input_keyword)
    number_of_URLs = len(list_of_urls)
    print(f"{number_of_URLs} YouTube video URLs found from search")
    # Rough per-URL timing estimate: 14-38 seconds each, reported in minutes
    max_amount_of_time = (38 * number_of_URLs) / 60
    min_amount_of_time = (14 * number_of_URLs) / 60
    print(f"Estimated time: {round(min_amount_of_time, 0)} - {round(max_amount_of_time, 0)} minutes")
    # Bug fix: the original measured duration as the difference of the
    # minutes field of time.localtime() (index 4), which wraps every hour
    # and can go negative. time.time() gives a monotonic-enough epoch delta.
    start_time = time.time()
    # enumerate replaces the original's two hand-maintained counters
    # (url_number and list_of_urls_index_counter, which merely re-indexed
    # the loop variable).
    for url_number, url in enumerate(list_of_urls, start=1):
        print(f"Starting URL {url_number}...")
        # Extracts the video information
        youtube_video_info = get_video_info(url)
        time.sleep(randint(4, 10))
        # Gets the YouTube transcriptions
        clean_transcription = get_transcription(url)
        time.sleep(randint(5, 13))
        # Stores them as a youtubevideo object
        yt_v = youtubevideo(
            url=url,
            title=youtube_video_info['title'],
            description=youtube_video_info['description'],
            views=youtube_video_info['views'],
            published=youtube_video_info['date_published'],
            likes=youtube_video_info['likes'],
            dislikes=youtube_video_info['dislikes'],
            channel_name=youtube_video_info['channel']['name'],
            channel_url=youtube_video_info['channel']['url'],
            channel_subscribers=youtube_video_info['channel']['subscribers'],
            transcription=clean_transcription
        )
        # Creates the row in which will be appended to the CSV file
        csv_file_rows = (input_keyword,
                         yt_v.url,
                         yt_v.title,
                         yt_v.description,
                         yt_v.views,
                         yt_v.published,
                         yt_v.likes,
                         yt_v.dislikes,
                         yt_v.channel_name,
                         yt_v.channel_url,
                         yt_v.channel_subscribers,
                         yt_v.transcription)
        # Appends the CSV file with the found video information/transcripts
        with open(paste_filename(input_keyword), 'a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(csv_file_rows)
        print(f"{url_number} / {number_of_URLs} YouTube URLs Complete.")
        time.sleep(randint(5, 15))
    # Elapsed wall-clock time in minutes
    time_passed = (time.time() - start_time) / 60
    print(f"Complete - Duration: ~{round(time_passed)} minutes ")
if __name__ == '__main__':
    # Entry point when run as a script: greet the user, then start the
    # interactive search -> scrape -> CSV extraction workflow.
    print("Welcome to the YouTube Transcription Program found at https://github.com/wesley4546/youtubetranscription")
    print("Feel free to make an issue on GitHub if you find a bug or have a suggestion")
    search_video_extraction_program()