-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.sh
113 lines (93 loc) · 3.35 KB
/
scraper.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/bin/bash
#
# Configuration for the scraper: the month, year and letter-group tokens that
# the loops further down combine into page URLs and output file names.
# The lists are whitespace-separated strings on purpose: the loops iterate
# over them with unquoted word splitting.

# Spanish month abbreviations, January through December.
meses="ene
feb
mar
abr
may
jun
jul
ago
sep
oct
nov
dic"
# Years to process. Earlier years are currently disabled:
# 2018
# 2017
anhos="2019"
# Letter groups that form the last part of each planta/contrata page name.
letras="ab
cd
efg
hijkl
mno
pqr
stu
vwxyz"
# Base URL that every scraped page name is appended to.
link="http://web.uchile.cl/transparencia/"
# The August salary pages have problems with accented characters.

#######################################
# Convert HTML table markup read from stdin into tab-separated rows.
# Everything up to and including the first line containing <tbody> is
# discarded; if no such line exists the whole input is discarded, so pages
# without a table body produce no output.
# Approach adapted from
# https://stackoverflow.com/questions/1403087/how-can-i-convert-an-html-table-to-csv
# Arguments: none (reads stdin)
# Outputs:   one line per table row on stdout, cells separated by tabs
#######################################
html_table_to_tsv() {
  sed "0,/<tbody>/Id" |
    # Keep only the lines that carry table/row/cell tags.
    grep -i -e '</\?TABLE\|</\?TD\|</\?TR\|</\?TH' |
    # Strip leading indentation, then join everything into a single line ...
    sed 's/^[\ \t]*//g' |
    tr -d '\n' |
    # ... and start a new line at the end of every table row.
    sed 's/<\/TR[^>]*>/\n/Ig' |
    # Per row: drop structural tags, trim the outer cell tags, turn the
    # remaining cell boundaries into tabs and decode the Spanish HTML
    # entities. The patterns previously contained the already-decoded
    # characters, which made them no-ops and lowercased the capital N-tilde.
    sed \
      -e 's/<\/\?\(TABLE\|TR\|TBODY\)[^>]*>//Ig' \
      -e 's/^<T[DH][^>]*>\|<\/\?T[DH][^>]*>$//Ig' \
      -e 's/<\/T[DH][^>]*><T[DH][^>]*>/\t/Ig' \
      -e 's/&aacute;/á/g; s/&eacute;/é/g; s/&iacute;/í/g' \
      -e 's/&oacute;/ó/g; s/&uacute;/ú/g; s/&ntilde;/ñ/g' \
      -e 's/&Aacute;/Á/g; s/&Eacute;/É/g; s/&Iacute;/Í/g' \
      -e 's/&Oacute;/Ó/g; s/&Uacute;/Ú/g; s/&Ntilde;/Ñ/g'
}

# Raw scrape output lives in datos/; create it so the first run does not fail.
mkdir -p datos

# Planta and contrata pages: one page per type, month, year and letter group.
# All letter groups of a month are appended to the same TSV file.
for tipo in "planta" "contrata"; do
  for anho in $anhos; do
    for mes in $meses; do
      salida="datos/${anho}${mes}_${tipo}.tsv"
      touch "$salida"
      printf '\n%s %s %s\n' "$tipo" "$mes" "$anho"
      for grupo in $letras; do
        url="${link}${tipo}${mes}${anho}${grupo}.html"
        printf '\t Scraping: %s\n' "$url"
        # stderr is discarded to hide curl's progress meter; a failed
        # download therefore silently contributes no rows.
        curl "$url" 2>/dev/null | html_table_to_tsv >> "$salida"
      done
    done
  done
done

# Honorarios pages use their own, coarser letter groups.
for anho in $anhos; do
  for mes in $meses; do
    printf '\nhonorarios %s %s\n' "$mes" "$anho"
    # Same <year><month> naming as the file that is appended to below and read
    # by the cleaning stage (the touch used to create a stray <month><year>
    # file instead).
    salida="datos/${anho}${mes}_honorarios.tsv"
    touch "$salida"
    for grupo in "abc" "defghijkl" "mnopqr" "stuvwxyz"; do
      url="${link}honorarios${mes}${anho}${grupo}.html"
      printf '\t Scraping: %s\n' "$url"
      curl "$url" 2>/dev/null | html_table_to_tsv >> "$salida"
    done
  done
done
# Cleaning stage for the planta/contrata files scraped into datos/.

# -p keeps a re-run from erroring when the directory already exists.
mkdir -p datos_limpios
touch "datos_limpios/sueldos.tsv"
touch "datos_limpios/personal.tsv"
# Start from an empty scratch file so rows left behind by an interrupted
# earlier run are not mixed into this one.
: > "datos_limpios/personal_tmp.tsv"
for tipo in "planta" "contrata"; do
  for mes in $meses; do
    for anho in $anhos; do
      # Delete, in place, every row that begins with whitespace.
      # NOTE(review): the original comment called these "empty rows", but the
      # pattern also removes rows whose first cell is empty (they start with a
      # tab) and keeps zero-length lines -- confirm this is intended.
      sed -i '/^\s\s*/d' "datos/${anho}${mes}_${tipo}.tsv"
      # Disabled: drop unneeded columns and prefix year and month to build sueldos.tsv
      # cut -d$'\t' -f9,11 --complement "datos/${anho}${mes}_${tipo}.tsv" | awk -v anho=$anho -v mes=$mes '{print anho "\t" mes "\t" $0}' >> "datos_limpios/sueldos.tsv"
      # Drop columns 1-2 and 6-17 and collect what remains in the scratch
      # file, which is de-duplicated further down in this script.
      cut -d$'\t' -f1-2,6-17 --complement "datos/${anho}${mes}_${tipo}.tsv" \
        >> "datos_limpios/personal_tmp.tsv"
    done
  done
done
# Cleaning stage for the honorarios files: tidy each monthly file in place,
# then append its trimmed rows to one combined table.
touch "datos_limpios/honorarios.tsv"
for mes in $meses; do
  for anho in $anhos; do
    honorarios_raw="datos/${anho}${mes}_honorarios.tsv"
    # Remove every double-quote character.
    sed -i 's/"//g' "$honorarios_raw"
    # Delete rows that begin with whitespace.
    sed -i '/^\s\s*/d' "$honorarios_raw"
    # Drop columns 6-8, then prefix each row with the literal HONORARIOS tag,
    # the year and the month, all tab-separated.
    cut -d$'\t' -f6-8 --complement "$honorarios_raw" |
      awk -v anho="$anho" -v mes="$mes" \
        'BEGIN { OFS = "\t" } { print "HONORARIOS", anho, mes, $0 }' \
        >> "datos_limpios/honorarios.tsv"
  done
done
# Write the sorted, de-duplicated rows of the scratch file to the final
# personal table, then discard the scratch file.
personal_scratch="datos_limpios/personal_tmp.tsv"
sort --unique "$personal_scratch" > "datos_limpios/personal.tsv"
rm "$personal_scratch"