crawlers.go

package main

import (
"bufio"
"encoding/json"
"io/ioutil"
"net/http"
"net/url"
"os"
"os/exec"
"strings"
"github.com/pkg/errors"
"github.com/thecsw/katya/log"
"github.com/thecsw/katya/storage"
"github.com/thecsw/katya/utils"
"gorm.io/gorm"
)

// crawlerActionPayload is the POST body of crawler actions
type crawlerActionPayload struct {
// Link is the crawler's link
Link string `json:"link"`
// Label is just user-created custom text
Label string `json:"label"`
	// Disabled flags whether the crawler is disabled (defaults to false -> enabled)
Disabled bool `json:"disabled"`
// OnlySubpaths tells us if we only do subdirectories of the link
OnlySubpaths bool `json:"only_subpaths"`
}
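
// For reference, a JSON body that decodes into crawlerActionPayload might look
// like the following; the URL and label values are illustrative, not taken from
// this project:
//
//	{
//	  "link": "https://example.com/blog",
//	  "label": "my blog crawler",
//	  "disabled": false,
//	  "only_subpaths": true
//	}
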
// crawlerCreator creates a crawler
func crawlerCreator(w http.ResponseWriter, r *http.Request) {
payload := &crawlerActionPayload{}
decoder := json.NewDecoder(r.Body)
err := decoder.Decode(payload)
	if err != nil {
		log.Error("Failed decoding a crawler creator payload", err, nil)
		httpJSON(w, nil, http.StatusBadRequest, err)
		return
	}
user := r.Context().Value(ContextKey("user")).(storage.User)
thisLogParams := log.Params{
"user": user.Name,
"link": payload.Link,
}
name, err := allocateCrawler(user.Name, payload.Link, payload.OnlySubpaths)
if err != nil {
log.Error("Failed allocating a crawler in creator payload", err, thisLogParams)
httpJSON(w, nil, http.StatusInternalServerError, err)
return
}
httpJSON(w, httpMessageReturn{"created crawler: " + name}, http.StatusOK, nil)
}

// crawlerRunner triggers a crawler
func crawlerRunner(w http.ResponseWriter, r *http.Request) {
payload := &crawlerActionPayload{}
decoder := json.NewDecoder(r.Body)
err := decoder.Decode(payload)
if err != nil {
log.Error("Failed decoding a crawler trigger payload", err, nil)
httpJSON(w, nil, http.StatusBadRequest, err)
return
}
user := r.Context().Value(ContextKey("user")).(storage.User)
thisLogParams := log.Params{
"user": user.Name,
"link": payload.Link,
}
name, err := triggerCrawler(user.Name, payload.Link)
if err != nil {
log.Error("Failed triggering a crawler in creator payload", err, thisLogParams)
httpJSON(w, nil, http.StatusInternalServerError, err)
return
}
httpJSON(w, httpMessageReturn{"triggered crawler: " + name}, http.StatusOK, nil)
}

func crawlerStatusReceiver(w http.ResponseWriter, r *http.Request) {
crawlerName := r.URL.Query().Get("name")
if crawlerName == "" {
httpJSON(w, nil, http.StatusBadRequest, errors.New("empty crawler name"))
return
}
val, err := storage.GetLastScrape(crawlerName)
if err != nil {
httpJSON(w, nil, http.StatusInternalServerError, errors.Wrap(err, "getting last scrape"))
return
}
httpJSON(w, *val, http.StatusOK, nil)
}
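
// For reference, this handler expects the crawler name as a query parameter, so
// a client call looks roughly like GET <status-endpoint>?name=sandy-0a1b2c3d4e and
// receives the last scrape record as JSON; the path and example name here are
// illustrative, since routing is defined elsewhere.
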
// genCrawlerName takes a user and their link and returns a deterministic,
// effectively unique name for a new crawler (the user name plus a short hash of the link)
func genCrawlerName(user, link string) string {
return user + "-" + utils.ShaEncode(link)[:10]
}
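
// For example, assuming utils.ShaEncode returns a hex-encoded digest of its
// input, the generated name is stable for a given user/link pair:
//
//	name := genCrawlerName("sandy", "https://example.com")
//	// name looks like "sandy-0a1b2c3d4e" (the hash shown is illustrative)
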
// allocateCrawler actually tries to fully allocate and write a new crawler to disk
func allocateCrawler(user, link string, onlySubpaths bool) (string, error) {
	// The name is derived deterministically from the user and link (see genCrawlerName)
name := genCrawlerName(user, link)
// Create params for logging purposes
thisParams := log.Params{
"name": name,
"user": user,
"url": link,
"only_subpaths": onlySubpaths,
}
// Check if a crawler already exists, if it doesn't,
// then create one and use it later to trigger it
crawlerExists, err := storage.IsCrawler(name)
if err != nil && err != gorm.ErrRecordNotFound {
log.Error("failed existence allocating", err, thisParams)
return "", errors.Wrap(err, "failed existence allocating")
}
// Create a crawler if one doesn't exist
if !crawlerExists {
err = storage.CreateCrawler(name, user, link)
if err != nil {
log.Error("failed creating allocating", err, thisParams)
return "", errors.Wrap(err, "failed creating allocating")
}
}
// Parse the url to retrieve domain
parsedURL, err := url.Parse(link)
if err != nil {
log.Error("bad parsing of the source", err, thisParams)
return "", errors.Wrap(err, "bad parsing of the source")
}
// Get the actual domain
domain := parsedURL.Host
// Write the scrapy python text file
err = writeNewCrawler(name, domain, link, onlySubpaths)
if err != nil {
log.Error("failed to write a crawler script", err, thisParams)
return "", errors.Wrap(err, "failed to write a crawler script")
}
log.Format("Successfully allocated a new crawler", thisParams)
return name, nil
}

// triggerCrawler actually triggers the saved crawler to start feeding texts
func triggerCrawler(user, link string) (string, error) {
	// The name is derived deterministically from the user and link (see genCrawlerName)
name := genCrawlerName(user, link)
// Create params for logging purposes
thisParams := log.Params{
"name": name,
"user": user,
"url": link,
}
// Check if a crawler already exists, if it doesn't,
// then create one and use it later to trigger it
crawlerExists, err := storage.IsCrawler(name)
if err != nil && err != gorm.ErrRecordNotFound {
log.Error("failed existence allocating", err, thisParams)
return "", errors.Wrap(err, "failed existence allocating")
}
if !crawlerExists {
err := errors.New("this crawler needs to be allocated first")
log.Error("this crawler needs to be allocated first, allocating", err, thisParams)
name, err = allocateCrawler(user, link, true)
if err != nil {
log.Error("failed hot allocating a crawler", err, thisParams)
return "", err
}
}
log.Format("Triggering a crawler", thisParams)
scrapyCmd := exec.Command("scrapy", "crawl", name)
scrapyCmd.Dir = ScrapyDir
logFile, err := os.Create(LogsDir + name + ".log")
if err != nil {
log.Error("Couldn't create a log file for a new scraper. Logs will be lost", err, thisParams)
return "", err
}
logWriter := bufio.NewWriter(logFile)
scrapyCmd.Stdout = logWriter
scrapyCmd.Stderr = logWriter
	// Run the process in the background
	go func() {
		// Always flush the buffered writer and close the log file,
		// even if the scrapy process itself fails.
		defer func() {
			if err := logWriter.Flush(); err != nil {
				log.Error("FAILED TO FLUSH SCRAPY LOG WRITER", err, log.Params{"log_file": logFile.Name()})
			}
			if err := logFile.Close(); err != nil {
				log.Error("FAILED TO CLOSE SCRAPY LOG FILE", err, log.Params{"log_file": logFile.Name()})
			}
		}()
		if err := scrapyCmd.Run(); err != nil {
			log.Error("FAILED TO RUN SCRAPY", err, log.Params{"log_file": logFile.Name()})
		}
	}()
// Run is blocking, Start is non-blocking
//return name, scrapyCmd.Run()
return name, nil
}
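
// For reference, the goroutine above is roughly equivalent to running
// "scrapy crawl <name>" from ScrapyDir by hand, with stdout and stderr
// redirected into LogsDir/<name>.log; ScrapyDir and LogsDir are package-level
// settings defined elsewhere.
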
// writeNewCrawler copies the template scrapy python script into our own
// and updates its settings, like only subpaths or not
func writeNewCrawler(name, domain, url string, onlySubpaths bool) error {
myCrawler := templateCrawler
myCrawler = strings.ReplaceAll(myCrawler, "<NAME>", name)
myCrawler = strings.ReplaceAll(myCrawler, "<DOMAIN>", domain)
myCrawler = strings.ReplaceAll(myCrawler, "<START>", url)
if onlySubpaths {
myCrawler = strings.ReplaceAll(myCrawler, "# nosubpath", "")
}
return ioutil.WriteFile(
CrawlersDir+name+".py",
[]byte(myCrawler),
0600,
)
}
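
// For reference, templateCrawler (defined elsewhere in this package) is expected
// to contain the literal <NAME>, <DOMAIN>, and <START> placeholders, plus lines
// tagged with the "# nosubpath" marker, which presumably become active once that
// marker is stripped for subpath-restricted crawls; the rendered script lands in
// CrawlersDir as "<name>.py". This description is inferred from the substitutions
// above, not from the actual template file.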