-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathBiopharmCatalyst_Download.py
119 lines (70 loc) · 2.77 KB
/
BiopharmCatalyst_Download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 20 13:00:49 2020
@author: Tejas
Stealing BioPharmCatalyst data :P
Step 1: Go to https://www.biopharmcatalyst.com/calendars/historical-catalyst-calendar
Step 2: Right click, and go to view source HTML.
Step 3: Copy the whole page source, paste it into a text file, and save it as "catalysthistory.txt" (the exact name the script opens) in the directory you run the script from.
Step 4: Just run the below code and enjoy :P
"""
#### Parse a saved copy of the BioPharmCatalyst page source and export it as CSV.
import re

import pandas as pd

#### Every catalyst entry starts on a line that contains this company link.
check_string = 'https://www.biopharmcatalyst.com/company'

#### Input and output files; both are resolved against the current working directory.
SOURCE_FILE = 'catalysthistory.txt'
OUTPUT_FILE = 'BioPharmCatalyst.csv'

#### CSV column headers, in the same order as the fields of each parsed record.
COLUMNS = [
    'Ticker',
    'Drug Name',
    'Indication',
    'Approved or CRL',
    'Catalyst Date',
    'Catalyst Description',
]


def _between(start, end):
    """Compile a pattern that captures the text between *start* and *end*.

    The capture uses a greedy ``.*``, so it extends to the LAST occurrence
    of *end* on the line, and it never spans a newline.
    """
    return re.compile('%s(.*)%s' % (start, end))


#### Compiled once here instead of being rebuilt for every entry in the loop.
_TICKER_RE = _between('"ticker">', '</')
_DRUG_RE = _between('"drug">', '</')
_INDICATION_RE = _between('"indication">', '</div>')
# NOTE(review): the capture starts right after the FIRST space on the line,
# so any further leading indentation is kept in the value -- confirm intended.
_OUTCOME_RE = _between(' ', '\n')
_DATE_RE = _between(' ">', '</time>')
_NOTE_RE = _between('-note">', '</div>')


def _capture(pattern, line):
    """Return group 1 of *pattern* in *line*.

    Raises:
        ValueError: if *pattern* does not match anywhere in *line*.
    """
    match = pattern.search(line)
    if match is None:
        raise ValueError(
            'pattern %r not found in line %r' % (pattern.pattern, line)
        )
    return match.group(1)


def parse_catalyst_lines(lines):
    """Extract one record per catalyst entry from the page-source lines.

    An entry begins on any line containing ``check_string``; its fields are
    read from fixed line offsets relative to that line.

    Args:
        lines: list of text lines, each normally ending in a newline (as
            returned by ``readlines()``).

    Returns:
        A list of ``(ticker, drug, indication, outcome, date, description)``
        tuples of strings, in file order. Empty when no entry is found.

    Raises:
        ValueError: if a field's pattern is missing from its expected line.
        IndexError: if an entry is cut off before its last expected line.
    """
    records = []
    for i, line in enumerate(lines):
        if check_string not in line:
            continue

        # The greedy capture runs to the last '</' on the line, so the final
        # four characters are dropped -- presumably a trailing '</a>'; verify
        # against the page markup.
        ticker = _capture(_TICKER_RE, line)[:-4]
        drug = _capture(_DRUG_RE, lines[i + 2])
        indication = _capture(_INDICATION_RE, lines[i + 3])
        outcome = _capture(_OUTCOME_RE, lines[i + 7])

        # Date and description sit at offsets 21/23 or, failing that, 20/22.
        # Both values are read BEFORE anything is recorded so that a failure
        # halfway through the first attempt cannot leave a stray date behind
        # and push the columns out of alignment.
        try:
            date = _capture(_DATE_RE, lines[i + 21])
            description = _capture(_NOTE_RE, lines[i + 23])
        except (ValueError, IndexError):
            date = _capture(_DATE_RE, lines[i + 20])
            description = _capture(_NOTE_RE, lines[i + 22])

        records.append((ticker, drug, indication, outcome, date, description))
    return records


def main():
    """Read the saved page source and write the parsed entries to CSV."""
    # The context manager closes the file even if reading fails.
    with open(SOURCE_FILE, encoding="utf8") as source:
        lines = source.readlines()

    final = pd.DataFrame(parse_catalyst_lines(lines), columns=COLUMNS)
    final.to_csv(OUTPUT_FILE, index=False)


if __name__ == "__main__":
    main()