-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcourseGuide.py
100 lines (87 loc) · 5.33 KB
/
courseGuide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
### Author: Ahmad Al-Dajani
### Description: This Program is intended to retrieve the course schedule for Georgetown University Law Center's ("GULC") Curriculum Guide,
### and turn it into a manageable excel file that can be easily sorted according to credit, exam date, instructor, course type, time etc...
### Help: Professor Paul Ohm's Computer Programming for Lawyers Course, taught at GULC. I had ZERO programming experience prior to this class,
### and it enabled me to develop sufficient proficiency in python programming to create this program and make my life easier
### Dependencies: requests, beautifulSoup 4, urllib, re, csv
### Time: 2-3 Hours - to create a program that sorted through many courses, and made the course selection process simple and convenient.
### DISCLAIMER: Beware of sorting the spreadsheet by exam date, as there are too many outliers for a consistent result. For the most part, dates and designations are accurate as they pertain to courses.
import requests
import bs4
import urllib.parse
import re
import csv
# URL of the GULC Curriculum Guide schedule page.
url = 'http://apps.law.georgetown.edu/curriculum/tab_schedules.cfm'

# Query-string parameters sent along with the request.
parameters = {
    'Status': 'Schedule',
    'Type': 'Letter',
    # Program to list; switch 'JD' to 'LLM' to get the LLM schedules instead.
    'Program': 'JD',
    # NOTE(review): the meaning of 39 is not documented in this file - confirm
    # against the curriculum guide site before changing it.
    'Year': 39,
    # Semester code: 'YYYYA' is Spring, 'YYYYB' is Summer, 'YYYYC' is Fall.
    'Term': '2017C',
    # Courses are listed by letter; 'ALL' requests every course.
    'Letter': 'ALL',
    # Present -> writing-course schedule. Comment this line out to get the
    # exam-course schedule instead (the trailing commas make that safe).
    'Writing': 1,
}

# Maps the last character of a term code to the season used in the file name.
_SEASONS = {'A': 'Spring', 'B': 'Summer', 'C': 'Fall'}


def build_filename(term, is_writing):
    """Return the CSV file name for a term code and schedule kind.

    term: term code such as '2017C' (four-digit year plus season letter).
    is_writing: True for the writing-course schedule ('WR' suffix), False for
        the exam-course schedule ('EX' suffix).

    A well-formed code yields e.g. 'CourseGuideGULCFall2017WR.csv'. A code
    that does not follow the year+letter shape is embedded verbatim, so every
    term still produces a usable name instead of leaving it undefined.
    """
    kind = 'WR' if is_writing else 'EX'
    year, season_code = term[:4], term[4:]
    if len(term) == 5 and year.isdigit() and season_code in _SEASONS:
        return 'CourseGuideGULC{}{}{}.csv'.format(_SEASONS[season_code], year, kind)
    return 'CourseGuideGULC{}{}.csv'.format(term, kind)


# Name of the CSV file that will be created in the working directory.
filename = build_filename(parameters['Term'], 'Writing' in parameters)
# Headings for the first row of the CSV, in the same order as each data row.
fieldNames = ['Course No.', 'CRN', 'Course Title', 'CR.', 'Faculty', 'Days', 'From-To', 'Exam/Paper']

# Patterns are compiled once here instead of being looked up on every row.
_COURSE_NO_RE = re.compile(r'\w+\-\d+\-\d+')  # course number at the start of the first cell
_CRN_RE = re.compile(r'\(.+?\)')              # first parenthesised group in the CRN div
_TIME_RE = re.compile(r'\d+\:\d+\-\d+\:\d+')  # "H:MM-H:MM" at the start of the time text
_WHITESPACE_RE = re.compile(r'\s+')


def _parse_course_row(row):
    """Turn one table row of the schedule page into a list of CSV values.

    row: a Beautiful Soup tag for a single tr element.

    Returns the values in ``fieldNames`` order, or ``None`` when the row does
    not have the expected cells and tags (for example the header row, which
    holds th cells rather than td cells).
    """
    cols = row.find_all('td')
    try:
        # The first cell holds extra text around the course number, and the
        # registration number sits in a nested div (parentheses are kept).
        course_no = _COURSE_NO_RE.match(cols[0].get_text()).group(0)
        reg_no = _CRN_RE.search(cols[0].find('div').string).group(0)
        # The course title is the string of the link in the second cell.
        title = cols[1].find('a').string
        credit = cols[2].get_text()
        # The full instructor text is kept as-is.
        faculty = cols[3].get_text()
        days = cols[5].get_text()
        # The time lives in a nobr tag; only the leading time range is kept.
        course_time = _TIME_RE.match(cols[6].find('nobr').string).group(0)
        # Course classification or exam date, with all whitespace removed.
        exam_or_paper = _WHITESPACE_RE.sub('', cols[7].text)
    except (IndexError, AttributeError, TypeError):
        # IndexError: too few cells. AttributeError: an expected tag or regex
        # match is missing. TypeError: a tag had no single string to search.
        # These mark rows that are not course rows, so they are skipped;
        # any other error is a real bug and is allowed to surface.
        return None
    return [course_no, reg_no, title, credit, faculty, days, course_time, exam_or_paper]


# Fetch and parse the page before touching the output file, so that a failed
# request cannot leave behind an empty or half-written CSV.
response = requests.get(url, params=parameters, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'html.parser')
# Every tr element on the page is considered; rows that are not course rows
# are filtered out by _parse_course_row.
rows = soup.select('tr')
courseRows = [parsed for parsed in map(_parse_course_row, rows) if parsed is not None]

# The context manager closes the file even if writing fails. errors='replace'
# keeps rows whose text has characters outside Latin-1 (they become '?')
# rather than losing the whole row to an encoding error.
with open(filename, 'w', newline='', encoding='latin-1', errors='replace') as fp:
    writer = csv.writer(fp)
    writer.writerow(fieldNames)
    writer.writerows(courseRows)