-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcep_finder.py
80 lines (70 loc) · 2.83 KB
/
cep_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re
from typing import Optional
from unicodedata import normalize

import requests
from bs4 import BeautifulSoup
class CepFinder:
    '''
    Look up the CEP (postal code) of a street address by scraping public websites.

    The address is given in pieces (street, number, city and state abbreviation)
    and each `find_cep_*` method queries a different website with it.
    Every lookup returns the CEP as a 'NNNNN-NNN' string, or None when no
    well-formed CEP could be extracted from the response.
    '''

    # Shape of a CEP accepted by _validate_cep: five digits, a hyphen, three digits.
    # Raw string so that `\d` is a regex escape and not an invalid string escape.
    _CEP_PATTERN = re.compile(r'^\d{5}-\d{3}$')

    def __init__(self, address, number, city, uf, verbose=False, timeout=30):
        '''
        Store the address components used to build the search requests.

        address: street name (logradouro).
        number: street number.
        city: city name.
        uf: state abbreviation, sent as the 'UF' form field.
        verbose: when True, print a message whenever a lookup yields no valid CEP.
        timeout: seconds to wait for each HTTP request before `requests` raises
            a timeout error; pass None to wait indefinitely.
        '''
        self.address = address
        self.number = number
        self.city = city
        self.uf = uf
        self.verbose = verbose
        self.timeout = timeout

    def _generate_json_data(self) -> dict:
        '''
        Generate the form data to be sent to the Correios website.
        City and street names are sent without diacritics.
        '''
        return {
            'UF': self.uf,
            'Localidade': self._replace_diacritics(self.city),
            'Tipo': '',
            'Logradouro': self._replace_diacritics(self.address),
            'Numero': self.number,
        }

    def _generate_complete_address_string(self) -> str:
        '''
        Generate the complete address string to be sent to Mapacep website.
        '''
        return f'{self.address} {self.number} {self.city} {self.uf}'

    @staticmethod
    def _replace_diacritics(string) -> str:
        '''
        Replace a character with a diacritic with the same character without diacritic.
        Input: 'Álvaro'
        Output: 'Alvaro'
        '''
        # NFKD splits each accented character into its base character plus
        # combining marks; encoding to ASCII with 'ignore' then drops the marks
        # (and any other non-ASCII character).
        return normalize('NFKD', string).encode('ASCII', 'ignore').decode('ASCII')

    def _validate_cep(self, cep, doc) -> Optional[str]:
        '''
        Return `cep` if it has the 'NNNNN-NNN' format, otherwise None.

        `doc` is the parsed page the candidate was taken from; it is accepted
        for interface compatibility and is not inspected.
        When validation fails and `self.verbose` is set, an error message is printed.
        '''
        if isinstance(cep, str) and self._CEP_PATTERN.match(cep):
            return cep
        if self.verbose:
            print('Error while scraping the request')
        return None

    def find_cep_mapacep(self) -> Optional[str]:
        '''
        Get the CEP of an address using www.mapacep.com.br, limited to a few CEPs (not sure about the exact limit).

        Returns the CEP string, or None when the response does not contain a
        well-formed CEP at the expected position.
        '''
        data = {
            'keywords': self._generate_complete_address_string(),
            'submit': 'pesquisar',
        }
        url = "https://www.mapacep.com.br/"
        response = requests.post(url, data=data, timeout=self.timeout)
        doc = BeautifulSoup(response.text, 'html.parser')
        paragraphs = doc.find_all('p')
        if len(paragraphs) < 3:
            # Unexpected page layout: report it as a failed lookup instead of
            # letting the index below raise IndexError.
            return self._validate_cep('', doc)
        # The CEP is expected in the third <p> tag (index 2) of the page.
        paragraph_html = str(paragraphs[2])
        length = len(paragraph_html)
        # Reverse slicing: take the 9 characters that end 12 characters before
        # the end of the tag's HTML. This depends on the site's exact markup;
        # anything that is not a CEP is rejected by _validate_cep.
        cep = paragraph_html[(length - 21):(length - 12)]
        return self._validate_cep(cep, doc)

    def find_cep_correios_website(self) -> Optional[str]:
        '''
        Deprecated as the original endpoint was shutdown by Correios.
        Scrapes CEP by address from the Correios Official Website. The website blocks scraping after ~300 requests

        Returns the CEP string, or None when the response does not contain a
        well-formed CEP at the expected position.
        '''
        data = self._generate_json_data()
        url = "http://www.buscacep.correios.com.br/sistemas/buscacep/resultadoBuscaCep.cfm"
        response = requests.post(url, data=data, timeout=self.timeout)
        doc = BeautifulSoup(response.text, 'html.parser')
        cells = doc.find_all('td')
        if len(cells) < 4:
            # No result table in the response: report it as a failed lookup
            # instead of letting the index below raise IndexError.
            return self._validate_cep('', doc)
        # The CEP is expected in the fourth <td> cell (index 3) of the page.
        cep = cells[3].text
        return self._validate_cep(cep, doc)