Skip to content

Commit

Permalink
Adiciona novos parâmetros para o QD #171
Browse files Browse the repository at this point in the history
  • Loading branch information
slfabio committed Feb 17, 2025
1 parent 9e58861 commit 1fa2d43
Show file tree
Hide file tree
Showing 9 changed files with 196 additions and 50 deletions.
13 changes: 9 additions & 4 deletions dag_confs/examples_and_tests/qd_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@ dag:
description: DAG de teste
search:
sources:
- QD
- QD
territory_id: 3106200
terms:
- pandemia
- dados pessoais
- prefeitura
- pandemia
- prefeitura
- '"dados pessoais"'
- '"Software+valor"~100'
- '"\"Alimentação escolar\"+valor"~100'
is_exact_search: False
number_of_excerpts: 5
excerpt_size: 500
report:
emails:
- destination@economia.gov.br
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ dag:
terms:
- LGPD
- RIO DE JANEIRO
force_rematch: On
ignore_signature_match: On
- DADOS PESSOAIS
is_exact_search: True
number_of_excerpts: 5
excerpt_size: 500
report:
emails:
- destination@economia.gov.br
Expand Down
11 changes: 8 additions & 3 deletions docs/docs/como_funciona/exemplos.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,14 @@ dag:
- QD
territory_id: 3106200 # Belo Horizonte
terms:
- pandemia
- dados pessoais
- prefeitura
- pandemia
- prefeitura
- '"dados pessoais"'
- '"Software+valor"~100'
- '"\"Alimentação escolar\"+valor"~100'
is_exact_search: False
number_of_excerpts: 5
excerpt_size: 500
report:
emails:
- destination@gestao.gov.br
Expand Down
2 changes: 2 additions & 0 deletions docs/docs/como_funciona/parametros.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ A página abaixo lista os parâmetros configuráveis nos arquivos YAML:
- **sources**: Fontes de pesquisa dos diários oficiais. Pode ser uma ou uma lista. Opções disponíveis: DOU, QD, INLABS.
- **terms**: Lista de termos a serem buscados. Para o INLABS podem ser utilizados operadores avançados de busca.
- **territory_id**: Identificador do município ou lista de identificadores de municípios. Necessário para buscar no Querido Diário.
- **excerpt_size**: Número máximo de caracteres exibidos no trecho onde o termo de busca foi localizado. (Funcionalidade disponível apenas no Querido Diário)
- **number_of_excerpts**: Número máximo de ocorrências do termo de busca em uma mesma edição. (Funcionalidade disponível apenas no Querido Diário)

## Parâmetros do Relatório (Report)
- **attach_csv**: Anexar no email o resultado da pesquisa em CSV.
Expand Down
11 changes: 6 additions & 5 deletions src/dou_dag_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,8 @@ def perform_searches(
result_as_email: Optional[bool],
department: List[str],
pubtype: List[str],
excerpt_size: Optional[int],
number_of_excerpts: Optional[int],
**context,
) -> dict:
"""Performs the search in each source and merge the results"""
Expand Down Expand Up @@ -312,13 +314,10 @@ def perform_searches(
qd_result = self.searchers["QD"].exec_search(
territory_id=territory_id,
term_list=term_list,
dou_sections=dou_sections,
search_date=search_date,
field=field,
is_exact_search=is_exact_search,
ignore_signature_match=ignore_signature_match,
force_rematch=force_rematch,
reference_date=get_trigger_date(context, local_time=True),
excerpt_size=excerpt_size,
number_of_excerpts=number_of_excerpts,
result_as_email=result_as_email,
)

Expand Down Expand Up @@ -491,6 +490,8 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
"use_summary": subsearch.use_summary,
"department": subsearch.department,
"pubtype": subsearch.pubtype,
"excerpt_size": subsearch.excerpt_size,
"number_of_excerpts": subsearch.number_of_excerpts,
"result_as_email": result_as_html(specs),
},
)
Expand Down
10 changes: 10 additions & 0 deletions src/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,16 @@ class SearchConfig(BaseModel):
pubtype: Optional[List[str]] = Field(
default=None, description="Lista de tipo de publicações para filtrar a pesquisa"
)
excerpt_size: Optional[int] = Field(
default=None,
description="Número máximo de caracteres exibidos no trecho onde o termo de busca foi localizado. "
"(Funcionalidade disponível apenas no Querido Diário)"
)
number_of_excerpts: Optional[int] = Field(
default=None,
description="Número máximo de ocorrências do termo de busca em uma mesma edição. "
"(Funcionalidade disponível apenas no Querido Diário)"
)


class ReportConfig(BaseModel):
Expand Down
65 changes: 44 additions & 21 deletions src/searchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,25 +342,23 @@ def exec_search(
self,
territory_id,
term_list,
dou_sections: List[str],
search_date,
field,
is_exact_search: bool,
ignore_signature_match: bool,
force_rematch: bool,
reference_date: datetime,
excerpt_size: int,
number_of_excerpts: int,
result_as_email: bool = True,
):
force_rematch = True if force_rematch is None else force_rematch
term_list = self._cast_term_list(term_list)
tailored_date = reference_date - timedelta(days=1)
search_results = {}
for search_term in term_list:
results = self._search_term(
territory_id=territory_id,
search_term=search_term,
is_exact_search=is_exact_search,
reference_date=tailored_date,
force_rematch=force_rematch,
excerpt_size=excerpt_size,
number_of_excerpts=number_of_excerpts,
result_as_email=result_as_email,
)
if results:
Expand All @@ -373,16 +371,20 @@ def _search_term(
self,
territory_id,
search_term,
is_exact_search,
reference_date,
force_rematch: bool,
excerpt_size,
number_of_excerpts,
result_as_email: bool = True,
) -> list:
payload = _build_query_payload(search_term, reference_date)

if territory_id:
if isinstance(territory_id, int): territory_id = [territory_id]
for terr_id in territory_id:
payload.append(("territory_ids", terr_id))
payload = _build_query_payload(
search_term,
is_exact_search,
reference_date,
territory_id,
excerpt_size,
number_of_excerpts
)

req_result = requests.get(self.API_BASE_URL, params=payload)

Expand Down Expand Up @@ -414,18 +416,39 @@ def parse_result(self, result: dict, result_as_email: bool = True) -> dict:
}


def _build_query_payload(search_term: str, reference_date: datetime) -> List[tuple]:
return [
("size", 100),
("excerpt_size", 250),
("sort_by", "descending_date"),
def _build_query_payload(
search_term: str,
is_exact_search: bool,
reference_date: datetime,
territory_id,
excerpt_size: int = 250,
number_of_excerpts: int = 3
) -> List[tuple]:
if is_exact_search:
search_term = f'"{search_term}"'

size = 100
payload_territory_id = []
if territory_id:
if isinstance(territory_id, int): territory_id = [territory_id]
for terr_id in territory_id:
payload_territory_id.append(("territory_ids", terr_id))
# Como a busca é realizada sempre em um única data,
# no resultado haverá no máximo 1 edição por município
size = len(territory_id)

payload = [
("size", size),
("excerpt_size", excerpt_size),
("sort_by", "relevance"),
("pre_tags", "<%%>"),
("post_tags", "</%%>"),
("number_of_excerpts", 3),
("number_of_excerpts", number_of_excerpts),
("published_since", reference_date.strftime("%Y-%m-%d")),
("published_until", reference_date.strftime("%Y-%m-%d")),
("querystring", f'"{search_term}"'),
("querystring", search_term),
]
return payload + payload_territory_id


class INLABSSearcher(BaseSearcher):
Expand Down
7 changes: 5 additions & 2 deletions tests/parsers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,7 @@
"terms": [
"LGPD",
"RIO DE JANEIRO",
"DADOS PESSOAIS"
],
"header": "Teste com múltiplos territory_id",
"sources": ["QD"],
Expand All @@ -615,12 +616,14 @@
"search_date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": True,
"force_rematch": True,
"ignore_signature_match": False,
"force_rematch": False,
"full_text": False,
"use_summary": False,
"department": None,
"pubtype": None,
"number_of_excerpts": 5,
"excerpt_size": 500,
}
],
"report": {
Expand Down
121 changes: 108 additions & 13 deletions tests/qd_searcher_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,14 @@ def test_build_query_payload(pre_tags: str,
post_tags: str):
payload = _build_query_payload(
search_term='paralelepípedo',
is_exact_search=True,
reference_date=datetime(2023, 2, 9),
territory_id=None,
)
expected = [
('size', 100),
('excerpt_size', 250),
('sort_by', 'descending_date'),
('sort_by', 'relevance'),
('pre_tags', pre_tags),
('post_tags', post_tags),
('number_of_excerpts', 3),
Expand All @@ -80,22 +82,115 @@ def test_build_query_payload(pre_tags: str,

assert payload == expected


@pytest.mark.parametrize(
'territory_id, expected_payload',
[
(3300100, [('territory_ids', 3300100)]),
([3300100, 3300159], [('territory_ids', 3300100), ('territory_ids', 3300159)]),
(
None,
[
('size', 100),
('excerpt_size', 250),
('sort_by', 'relevance'),
('pre_tags', "<%%>"),
('post_tags', "</%%>"),
('number_of_excerpts', 3),
('published_since', '2023-02-09'),
('published_until', '2023-02-09'),
('querystring', '"paralelepípedo"')
]
),
(
3303302,
[
('size', 1),
('excerpt_size', 250),
('sort_by', 'relevance'),
('pre_tags', "<%%>"),
('post_tags', "</%%>"),
('number_of_excerpts', 3),
('published_since', '2023-02-09'),
('published_until', '2023-02-09'),
('querystring', '"paralelepípedo"'),
('territory_ids', 3303302)
]
),
(
[3303302, 3303303],
[
('size', 2),
('excerpt_size', 250),
('sort_by', 'relevance'),
('pre_tags', "<%%>"),
('post_tags', "</%%>"),
('number_of_excerpts', 3),
('published_since', '2023-02-09'),
('published_until', '2023-02-09'),
('querystring', '"paralelepípedo"'),
('territory_ids', 3303302),
('territory_ids', 3303303)
]
),
]
)
def test_search_with_multiple_territory_ids(territory_id, expected_payload):
#searcher = QDSearcher()
payload = []
def test_build_query_payload_territory_id_and_size(territory_id, expected_payload):
payload = _build_query_payload(
search_term='paralelepípedo',
is_exact_search=True,
reference_date=datetime(2023, 2, 9),
territory_id=territory_id
)

assert payload == expected_payload

# Simula a lógica que foi alterada para suportar múltiplos IDs de território
if isinstance(territory_id, int):
territory_id = [territory_id]
for terr_id in territory_id:
payload.append(('territory_ids', terr_id))
@pytest.mark.parametrize(
    'excerpt_size, number_of_excerpts, expected_payload',
    [
        (
            500,
            5,
            [
                ('size', 100),
                ('excerpt_size', 500),
                ('sort_by', 'relevance'),
                ('pre_tags', "<%%>"),
                ('post_tags', "</%%>"),
                ('number_of_excerpts', 5),
                ('published_since', '2023-02-09'),
                ('published_until', '2023-02-09'),
                ('querystring', '"paralelepípedo"'),
            ],
        ),
    ],
)
def test_build_query_payload_excerpt_params(excerpt_size, number_of_excerpts, expected_payload):
    """Explicit excerpt settings must appear unchanged in the built payload."""
    query_args = {
        'search_term': 'paralelepípedo',
        'is_exact_search': True,
        'reference_date': datetime(2023, 2, 9),
        'territory_id': None,
        'excerpt_size': excerpt_size,
        'number_of_excerpts': number_of_excerpts,
    }

    assert _build_query_payload(**query_args) == expected_payload

assert payload == expected_payload
@pytest.mark.parametrize(
    'is_exact_search, expected_search_term',
    [
        (True, '"paralelepípedo"'),
        (False, 'paralelepípedo'),
    ],
)
def test_search_with_is_exact_search(is_exact_search, expected_search_term):
    """The term is quoted only when an exact search is requested."""
    query_args = {
        'search_term': 'paralelepípedo',
        'is_exact_search': is_exact_search,
        'reference_date': datetime(2023, 2, 9),
        'territory_id': None,
        'excerpt_size': 250,
        'number_of_excerpts': 3,
    }

    # Without a territory filter the last payload entry is inspected; its
    # second element is the value sent as the query string.
    _, sent_term = _build_query_payload(**query_args)[-1]

    assert sent_term == expected_search_term

0 comments on commit 1fa2d43

Please sign in to comment.