-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgcloud.R
55 lines (37 loc) · 1.65 KB
/
gcloud.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
library(rvest)
library(tidyverse)
library(stringr)
## Script to extract a table of G-Cloud suppliers to make it easier to find relevant ones
## Main G-Cloud supplier listing page
url <- "https://www.digitalmarketplace.service.gov.uk/g-cloud/suppliers"
## Every href found on that page
page <- html_attr(html_nodes(read_html(url), "a"), "href")
## Keep only the links that point at individual supplier pages
gs <- grep("/g-cloud/supplier/", page, value = TRUE)
## Prefix each link with the site host (one list element per supplier link)
gsurls <- map(gs, function(link) {
  paste0("https://www.digitalmarketplace.service.gov.uk", link)
})
## Collect the supplier-page URLs linked from a listing page.
##
## Reads the page at `url`, takes the href of every <a> element, keeps those
## containing "/g-cloud/supplier/" and prefixes them with the site host.
##
## @param url Address of the listing page to read.
## @return A list of character strings, one URL per matching link
##   (an empty list when the page has no matching links).
exText <- function(url) {
  hrefs <- read_html(url) %>%
    html_nodes("a") %>%
    html_attr("href")
  supplier_paths <- hrefs[grepl(pattern = "/g-cloud/supplier/", hrefs)]
  # Return the mapped list directly: the previous version only yielded it
  # invisibly, as the value of an assignment to an otherwise unused local.
  # NOTE(review): assumes the hrefs are site-relative paths - confirm.
  map(supplier_paths, function(x) {
    paste0("https://www.digitalmarketplace.service.gov.uk", x)
  })
}
## Run the link extractor once against the main listing page
test <- exText(url)
## One listing URL per letter A-Z
suppliers_ABC <- paste0(
  "https://www.digitalmarketplace.service.gov.uk/g-cloud/suppliers?prefix=",
  LETTERS
)
## Supplier page URLs from every listing page, flattened with unlist()
listsup <- unlist(map(suppliers_ABC, exText))
## Build the table of supplier names, descriptions and emails.

## Scrape a single supplier page.
##
## @param supplier_url URL of one supplier page.
## @return A data frame with the columns supplier_name, supplier_details and
##   supplier_email, built the same way as before (cbind of the three fields).
scrape_supplier <- function(supplier_url) {
  # Download and parse the page once; it used to be fetched three times,
  # once per extracted field.
  doc <- read_html(supplier_url)

  # Drop newlines, then collapse runs of whitespace.
  clean_text <- function(x) tm::stripWhitespace(gsub("\\n", "", x))

  supplier_name <- clean_text(html_text(html_nodes(doc, "h1")))
  supplier_details <- clean_text(
    html_text(html_nodes(doc, ".supplier-description"))
  )

  link_text <- html_text(html_nodes(doc, "a"))
  # Second link text containing "@" (NA when there are fewer than two).
  # NOTE(review): presumably the first "@" link is a site-wide contact
  # address rather than the supplier's - confirm.
  supplier_email <- link_text[grepl("@", link_text)][2]

  # The local names above become the column names via cbind().
  data.frame(
    cbind(supplier_name, supplier_details, supplier_email),
    stringsAsFactors = FALSE
  )
}

## Scrape every supplier page; a page that fails to load is reported with a
## warning and skipped instead of aborting the whole run.
supplier_rows <- map(listsup, function(supplier_url) {
  tryCatch(
    scrape_supplier(supplier_url),
    error = function(e) {
      warning(
        "Skipping ", supplier_url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
})

## Bind all rows in one step instead of growing the data frame per iteration.
## The leading empty data.frame keeps `df` a data.frame even with no rows.
df <- bind_rows(c(list(data.frame()), compact(supplier_rows)))
## Render the scraped suppliers as a DT table with filters at the top
DT::datatable(df, filter = "top", caption = "G-Cloud suppliers")