Commit 5fd7e33

Merge pull request #58 from bukosabino/develop-black-format

black format

bukosabino authored Feb 4, 2024
2 parents 2c4f0b9 + 9edcb1b commit 5fd7e33
Showing 6 changed files with 172 additions and 184 deletions.
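This commit is formatting-only: it runs the black autoformatter over the BOCM ETL sources, with no behavioral changes. As a minimal sketch of what black does to the lines below, its Python API can be applied to a single snippet (black.format_str is part of the black package; the repo's actual black configuration is not shown on this page, so default settings are assumed here):

```python
# Minimal sketch: reproduce black's quote and comment normalization on one line.
# Assumes black's default settings; the repo may configure line length differently.
import black

src = "CVE_REGEX = r'^BOCM-\\d{8}-\\d{1,3}$' # TODO: regex demasiado laxa\n"
print(black.format_str(src, mode=black.FileMode()))
# CVE_REGEX = r"^BOCM-\d{8}-\d{1,3}$"  # TODO: regex demasiado laxa
```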
20 changes: 8 additions & 12 deletions src/etls/bocm/metadata.py
```diff
@@ -8,7 +8,7 @@
 
 
 # REGEX
-CVE_REGEX = r'^BOCM-\d{8}-\d{1,3}$' # TODO: regex demasiado laxa
+CVE_REGEX = r"^BOCM-\d{8}-\d{1,3}$"  # TODO: regex demasiado laxa
 
 
 class BOCMMetadataDocument(MetadataDocument):
@@ -22,32 +22,29 @@ class BOCMMetadataDocument(MetadataDocument):
     source_type: str = "Boletin"
 
     # Metadatos
-    identificador: str = Field(pattern=CVE_REGEX, examples=['BOCM-20240129-24'])
-    numero_oficial: str = "" # Número de boletín
+    identificador: str = Field(pattern=CVE_REGEX, examples=["BOCM-20240129-24"])
+    numero_oficial: str = ""  # Número de boletín
     paginas: str
-    departamento: str # órgano (excepto sección 4, que no tiene)
-
+    departamento: str  # órgano (excepto sección 4, que no tiene)
 
     seccion_normalizada: str
     seccion: str
     subseccion: str
     tipo: str = ""
     apartado: str = ""
     rango: str = ""
 
-    # Links
-    titulo: str # title
-    url_pdf: str # pdf_link
-    url_html: str # html_link
-
+    # Links
+    titulo: str  # title
+    url_pdf: str  # pdf_link
+    url_html: str  # html_link
 
     fecha_publicacion: str
     fecha_disposicion: str = ""
     anio: str
     mes: str
     dia: str
 
-
     datetime_insert: str = datetime.utcnow().isoformat()
 
     @field_validator("fecha_publicacion", "fecha_disposicion")
@@ -56,4 +53,3 @@ def isoformat(cls, v):
         if v:
             return datetime.strptime(v, "%Y-%m-%d").strftime("%Y-%m-%d")
         return v
-
```
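For context, the field_validator left unchanged above normalizes date strings to YYYY-MM-DD and lets empty strings pass through. A self-contained sketch of the same pattern (the model name and values are invented for illustration; BOCMMetadataDocument itself has more required fields):

```python
from datetime import datetime

from pydantic import BaseModel, field_validator


class DatedDocument(BaseModel):  # hypothetical stand-in for BOCMMetadataDocument
    fecha_publicacion: str
    fecha_disposicion: str = ""

    @field_validator("fecha_publicacion", "fecha_disposicion")
    def isoformat(cls, v):
        # Raises ValueError for anything that is not YYYY-MM-DD; "" passes through.
        if v:
            return datetime.strptime(v, "%Y-%m-%d").strftime("%Y-%m-%d")
        return v


print(DatedDocument(fecha_publicacion="2024-01-29").fecha_publicacion)  # 2024-01-29
```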

95 changes: 43 additions & 52 deletions src/etls/bocm/scrapper.py
```diff
@@ -23,38 +23,39 @@ def _adapt_link_to_complete_summary(url: str) -> str:
     :url: url to transform. Example : https://www.bocm.es/boletin/bocm-20240126-22
     :return: summary of the day url. Example: https://www.bocm.es/boletin-completo/BOCM-20240126/22
     """
-    tmp_str = url.replace("boletin","boletin-completo").replace("/bocm","/BOCM")
-    res = re.sub(r'(\d)-(\d+)', r"\1/\2", tmp_str)
+    tmp_str = url.replace("boletin", "boletin-completo").replace("/bocm", "/BOCM")
+    res = re.sub(r"(\d)-(\d+)", r"\1/\2", tmp_str)
     return res
 
 
 # get url from response redirection
 def _get_summary_link_from_date(day: date) -> str:
     """Get summary url from response redirection
     :day: day format for request param: '%d/%m/%Y'
     :return: summary of the day url
     """
     logger = lg.getLogger(_get_summary_link_from_date.__name__)
 
-    search_url = 'https://www.bocm.es/search-day-month'
-
-    try:
-        response = requests.post(search_url, data={ 'field_date[date]' : day})
+    search_url = "https://www.bocm.es/search-day-month"
+
+    try:
+        response = requests.post(search_url, data={"field_date[date]": day})
         response.raise_for_status()
-        link = response.headers['Link'].split(';')[0].replace("<","").replace(">","")
-        if (re.search('search-day-month', link)):
-            raise ValueError('No link published')
+        link = response.headers["Link"].split(";")[0].replace("<", "").replace(">", "")
+        if re.search("search-day-month", link):
+            raise ValueError("No link published")
         else:
-           final_url = _adapt_link_to_complete_summary(link)
+            final_url = _adapt_link_to_complete_summary(link)
 
     except HTTPError:
-       logger.error("No link got on day %s", day)
-       final_url = None
+        logger.error("No link got on day %s", day)
+        final_url = None
 
     except ValueError as err:
-       logger.error("%s for day %s. Skiping...", err.args[0],day)
-       final_url = None
+        logger.error("%s for day %s. Skiping...", err.args[0], day)
+        final_url = None
 
     return final_url
```
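To make the URL rewrite in _adapt_link_to_complete_summary concrete, here is the same logic as a standalone snippet (the example URL comes from the docstring above):

```python
import re

url = "https://www.bocm.es/boletin/bocm-20240126-22"
# "boletin" -> "boletin-completo", "/bocm" -> "/BOCM", last "-NN" -> "/NN"
tmp_str = url.replace("boletin", "boletin-completo").replace("/bocm", "/BOCM")
res = re.sub(r"(\d)-(\d+)", r"\1/\2", tmp_str)
print(res)  # https://www.bocm.es/boletin-completo/BOCM-20240126/22
```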

```diff
@@ -63,47 +64,41 @@ def _extract_metadata(soup) -> tp.Dict:
     metadata_dict = {}
 
     # Metadata from head tags
-    fecha_publicacion,cve,html_link = metadata_from_head_tags(soup)
+    fecha_publicacion, cve, html_link = metadata_from_head_tags(soup)
 
     # Desc doc header
-    numero_oficial,seccion_normalizada,paginas,pdf_link = metadata_from_doc_header(soup)
+    numero_oficial, seccion_normalizada, paginas, pdf_link = metadata_from_doc_header(soup)
 
-    # Metadata from document
-    seccion = seccion_normalizada.split('.')[0]
-    subseccion,apartado,tipo,organo,anunciante,rango = metadata_from_doc(soup,seccion,cve)
+    # Metadata from document
+    seccion = seccion_normalizada.split(".")[0]
+    subseccion, apartado, tipo, organo, anunciante, rango = metadata_from_doc(soup, seccion, cve)
 
     metadata_dict["rango"] = rango
     metadata_dict["identificador"] = cve
     metadata_dict["numero_oficial"] = numero_oficial
     metadata_dict["paginas"] = paginas
 
     # departamento always match with organo
     metadata_dict["departamento"] = organo
 
     metadata_dict["seccion_normalizada"] = seccion_normalizada
     metadata_dict["seccion"] = seccion.upper()
     metadata_dict["subseccion"] = subseccion
     metadata_dict["tipo"] = tipo
     metadata_dict["apartado"] = apartado
 
     metadata_dict["titulo"] = cve
     metadata_dict["url_pdf"] = pdf_link
     metadata_dict["url_html"] = html_link
 
     metadata_dict["fecha_publicacion"] = fecha_publicacion
     metadata_dict["fecha_disposicion"] = fecha_publicacion
 
-    metadata_dict["anio"] = datetime.strptime(
-        fecha_publicacion, "%Y-%m-%d"
-    ).strftime("%Y")
-
-    metadata_dict["mes"] = datetime.strptime(
-        fecha_publicacion, "%Y-%m-%d"
-    ).strftime("%m")
+    metadata_dict["anio"] = datetime.strptime(fecha_publicacion, "%Y-%m-%d").strftime("%Y")
+
+    metadata_dict["mes"] = datetime.strptime(fecha_publicacion, "%Y-%m-%d").strftime("%m")
 
-    metadata_dict["dia"] = datetime.strptime(
-        fecha_publicacion, "%Y-%m-%d"
-    ).strftime("%d")
+    metadata_dict["dia"] = datetime.strptime(fecha_publicacion, "%Y-%m-%d").strftime("%d")
 
     return metadata_dict
```
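The anio/mes/dia assignments above parse the same date string three times; the result is simply its year, month, and day parts. A quick illustration (the date value is invented):

```python
from datetime import datetime

fecha_publicacion = "2024-01-29"  # hypothetical value
fecha = datetime.strptime(fecha_publicacion, "%Y-%m-%d")
print(fecha.strftime("%Y"), fecha.strftime("%m"), fecha.strftime("%d"))  # 2024 01 29
```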

```diff
@@ -112,17 +107,17 @@ def _list_links_day(url: str) -> tp.List[str]:
     """Get a list of links in a BOCM url day filtering by Seccion 1-A, 3 and 4.
     :param url: summary url link. Example: https://www.bocm.es/boletin-completo/BOCM-20240103/2
     :return: list of urls filtered by sections to download
     """
     logger = lg.getLogger(_list_links_day.__name__)
 
     logger.info("Scrapping day: %s", url)
     response = requests.get(url)
     response.raise_for_status()
-    soup = BeautifulSoup(response.text,features="lxml")
+    soup = BeautifulSoup(response.text, features="lxml")
 
     # filter by sections
-    sections_to_filter = ['1-A','3-','4-']
+    sections_to_filter = ["1-A", "3-", "4-"]
     filtered_links = filter_links_by_section(soup, sections_to_filter)
     logger.info("Scrapped day successfully %s (%s BOCM documents)", url, len(filtered_links))
```

```diff
@@ -135,30 +130,26 @@ def download_day(self, day: date) -> tp.List[BOCMMetadataDocument]:
         logger = lg.getLogger(self.download_day.__name__)
         logger.info("Downloading BOCM content for day %s", day)
         day_str = day.strftime("%d/%m/%Y")
 
         summary_url = _get_summary_link_from_date(day_str)
 
         metadata_documents = []
-        if(summary_url is not None):
+        if summary_url is not None:
             logger.info("Got summary url for day %s", day)
             logger.info("URL: [%s] for selected day [%s]", summary_url, day)
 
             try:
                 list_urls = _list_links_day(summary_url)
                 for url in list_urls:
                     try:
                         # Skip urls that contains in the path 'boletin'
-                        if (not re.search('boletin',url)):
+                        if not re.search("boletin", url):
                             metadata_doc = self.download_document(url)
                             metadata_documents.append(metadata_doc)
                     except HTTPError:
-                        logger.error(
-                            "Not scrapped document %s on day %s", url, day
-                        )
+                        logger.error("Not scrapped document %s on day %s", url, day)
                     except AttributeError:
-                        logger.error(
-                            "Not scrapped document %s on day %s", url, day
-                        )
+                        logger.error("Not scrapped document %s on day %s", url, day)
             except HTTPError:
                 logger.error("Not scrapped document on day %s", day_url)
         logger.info("Downloaded all BOCM docs for day %s", day)
```
```diff
@@ -176,7 +167,7 @@ def download_document(self, url: str) -> BOCMMetadataDocument:
         logger.info("Scrapping document: %s", url)
         response = requests.get(url)
         response.raise_for_status()
-        soup = BeautifulSoup(response.text,features="lxml")
+        soup = BeautifulSoup(response.text, features="lxml")
         with tempfile.NamedTemporaryFile("w", delete=False) as fn:
             text = soup.select_one("#main").get_text()
             text_cleaned = clean_text(text)
```
