From 1fa2d4379a2249b2843ed88c2eb82fa5f02b4d12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A1bio=20Lima?=
Date: Mon, 17 Feb 2025 10:42:29 -0300
Subject: [PATCH] =?UTF-8?q?Adiciona=20novos=20par=C3=A2metros=20para=20o?=
 =?UTF-8?q?=20QD=20#171?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 dag_confs/examples_and_tests/qd_example.yaml |  13 +-
 .../qd_list_territory_id_example.yaml        |   6 +-
 docs/docs/como_funciona/exemplos.md          |  11 +-
 docs/docs/como_funciona/parametros.md        |   2 +
 src/dou_dag_generator.py                     |  11 +-
 src/schemas.py                               |  10 ++
 src/searchers.py                             |  65 +++++++---
 tests/parsers_test.py                        |   7 +-
 tests/qd_searcher_test.py                    | 121 ++++++++++++++++--
 9 files changed, 196 insertions(+), 50 deletions(-)

diff --git a/dag_confs/examples_and_tests/qd_example.yaml b/dag_confs/examples_and_tests/qd_example.yaml
index 920e658..a1796e5 100644
--- a/dag_confs/examples_and_tests/qd_example.yaml
+++ b/dag_confs/examples_and_tests/qd_example.yaml
@@ -3,12 +3,17 @@ dag:
   description: DAG de teste
   search:
     sources:
-    - QD
+      - QD
     territory_id: 3106200
     terms:
-    - pandemia
-    - dados pessoais
-    - prefeitura
+      - pandemia
+      - prefeitura
+      - '"dados pessoais"'
+      - '"Software+valor"~100'
+      - '"\"Alimentação escolar\"+valor"~100'
+    is_exact_search: False
+    number_of_excerpts: 5
+    excerpt_size: 500
   report:
     emails:
       - destination@economia.gov.br
diff --git a/dag_confs/examples_and_tests/qd_list_territory_id_example.yaml b/dag_confs/examples_and_tests/qd_list_territory_id_example.yaml
index d9bb3d2..af17403 100644
--- a/dag_confs/examples_and_tests/qd_list_territory_id_example.yaml
+++ b/dag_confs/examples_and_tests/qd_list_territory_id_example.yaml
@@ -14,8 +14,10 @@ dag:
     terms:
       - LGPD
       - RIO DE JANEIRO
-    force_rematch: On
-    ignore_signature_match: On
+      - DADOS PESSOAIS
+    is_exact_search: True
+    number_of_excerpts: 5
+    excerpt_size: 500
   report:
     emails:
       - destination@economia.gov.br
diff --git a/docs/docs/como_funciona/exemplos.md b/docs/docs/como_funciona/exemplos.md
index e9b8d96..b51dcbf 100644
--- a/docs/docs/como_funciona/exemplos.md
+++ b/docs/docs/como_funciona/exemplos.md
@@ -134,9 +134,14 @@ dag:
       - QD
     territory_id: 3106200 # Belo Horizonte
     terms:
-    - pandemia
-    - dados pessoais
-    - prefeitura
+      - pandemia
+      - prefeitura
+      - '"dados pessoais"'
+      - '"Software+valor"~100'
+      - '"\"Alimentação escolar\"+valor"~100'
+    is_exact_search: False
+    number_of_excerpts: 5
+    excerpt_size: 500
   report:
     emails:
       - destination@gestao.gov.br
diff --git a/docs/docs/como_funciona/parametros.md b/docs/docs/como_funciona/parametros.md
index 1d09a27..cc0b99c 100644
--- a/docs/docs/como_funciona/parametros.md
+++ b/docs/docs/como_funciona/parametros.md
@@ -26,6 +26,8 @@ A página abaixo lista os parâmetros configuráveis nos arquivos YAML:
 - **sources**: Fontes de pesquisa dos diários oficiais. Pode ser uma ou uma lista. Opções disponíveis: DOU, QD, INLABS.
 - **terms**: Lista de termos a serem buscados. Para o INLABS podem ser utilizados operadores avançados de busca.
 - **territory_id**: Lista de identificadores do id do município. Necessário para buscar no Querido Diário.
+- **excerpt_size**: Número máximo de caracteres exibidos no trecho onde o termo de busca foi localizado. (Funcionalidade disponível apenas no Querido Diário)
+- **number_of_excerpts**: Número máximo de ocorrências do termo de busca em uma mesma edição. (Funcionalidade disponível apenas no Querido Diário)
 
 ## Parâmetros do Relatório (Report)
 - **attach_csv**: Anexar no email o resultado da pesquisa em CSV.
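The YAML examples above rely on two quoting behaviours: with is_exact_search: False, terms that already carry explicit double quotes or Querido Diário operators such as ~100 are sent verbatim, while is_exact_search: True wraps every term in double quotes. A minimal sketch of that rule, mirroring the quoting logic added to _build_query_payload further down in this patch (the helper name to_querystring is illustrative only, not project code):

    # Illustrative sketch only -- not part of the patch. Mirrors the quoting rule
    # implemented in src/searchers.py below: exact search wraps the term in double
    # quotes; otherwise the term (possibly carrying QD operators) is sent as-is.
    def to_querystring(term: str, is_exact_search: bool) -> str:
        return f'"{term}"' if is_exact_search else term

    assert to_querystring("dados pessoais", True) == '"dados pessoais"'
    assert to_querystring('"Software+valor"~100', False) == '"Software+valor"~100'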
diff --git a/src/dou_dag_generator.py b/src/dou_dag_generator.py
index 6a3734c..9f4bcc9 100755
--- a/src/dou_dag_generator.py
+++ b/src/dou_dag_generator.py
@@ -276,6 +276,8 @@ def perform_searches(
         result_as_email: Optional[bool],
         department: List[str],
         pubtype: List[str],
+        excerpt_size: Optional[int],
+        number_of_excerpts: Optional[int],
         **context,
     ) -> dict:
         """Performs the search in each source and merge the results"""
@@ -312,13 +314,10 @@ def perform_searches(
             qd_result = self.searchers["QD"].exec_search(
                 territory_id=territory_id,
                 term_list=term_list,
-                dou_sections=dou_sections,
-                search_date=search_date,
-                field=field,
                 is_exact_search=is_exact_search,
-                ignore_signature_match=ignore_signature_match,
-                force_rematch=force_rematch,
                 reference_date=get_trigger_date(context, local_time=True),
+                excerpt_size=excerpt_size,
+                number_of_excerpts=number_of_excerpts,
                 result_as_email=result_as_email,
             )
 
@@ -491,6 +490,8 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
                     "use_summary": subsearch.use_summary,
                     "department": subsearch.department,
                     "pubtype": subsearch.pubtype,
+                    "excerpt_size": subsearch.excerpt_size,
+                    "number_of_excerpts": subsearch.number_of_excerpts,
                     "result_as_email": result_as_html(specs),
                 },
             )
diff --git a/src/schemas.py b/src/schemas.py
index 19d8aa2..4eca358 100644
--- a/src/schemas.py
+++ b/src/schemas.py
@@ -138,6 +138,16 @@ class SearchConfig(BaseModel):
     pubtype: Optional[List[str]] = Field(
         default=None, description="Lista de tipo de publicações para filtrar a pesquisa"
     )
+    excerpt_size: Optional[int] = Field(
+        default=None,
+        description="Número máximo de caracteres exibidos no trecho onde o termo de busca foi localizado. "
+        "(Funcionalidade disponível apenas no Querido Diário)"
+    )
+    number_of_excerpts: Optional[int] = Field(
+        default=None,
+        description="Número máximo de ocorrências do termo de busca em uma mesma edição. "
+        "(Funcionalidade disponível apenas no Querido Diário)"
+    )
 
 
 class ReportConfig(BaseModel):
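For reference, a simplified stand-in (hypothetical, not the project's actual SearchConfig, which carries many more fields) showing how the two new optional fields behave under pydantic validation; when omitted they stay None and the Querido Diário API defaults apply:

    # Hypothetical, reduced model for illustration; the real SearchConfig lives in
    # src/schemas.py and holds the full set of search parameters.
    from typing import Optional
    from pydantic import BaseModel, Field

    class QDSearchOptions(BaseModel):
        excerpt_size: Optional[int] = Field(default=None)
        number_of_excerpts: Optional[int] = Field(default=None)

    print(QDSearchOptions())                                      # both fields None -> API defaults
    print(QDSearchOptions(excerpt_size=500, number_of_excerpts=5))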
" + "(Funcionalidade disponível apenas no Querido Diário)" + ) class ReportConfig(BaseModel): diff --git a/src/searchers.py b/src/searchers.py index ee2c549..a9724fd 100644 --- a/src/searchers.py +++ b/src/searchers.py @@ -342,16 +342,12 @@ def exec_search( self, territory_id, term_list, - dou_sections: List[str], - search_date, - field, is_exact_search: bool, - ignore_signature_match: bool, - force_rematch: bool, reference_date: datetime, + excerpt_size: int, + number_of_excerpts: int, result_as_email: bool = True, ): - force_rematch = True if force_rematch is None else force_rematch term_list = self._cast_term_list(term_list) tailored_date = reference_date - timedelta(days=1) search_results = {} @@ -359,8 +355,10 @@ def exec_search( results = self._search_term( territory_id=territory_id, search_term=search_term, + is_exact_search=is_exact_search, reference_date=tailored_date, - force_rematch=force_rematch, + excerpt_size=excerpt_size, + number_of_excerpts=number_of_excerpts, result_as_email=result_as_email, ) if results: @@ -373,16 +371,20 @@ def _search_term( self, territory_id, search_term, + is_exact_search, reference_date, - force_rematch: bool, + excerpt_size, + number_of_excerpts, result_as_email: bool = True, ) -> list: - payload = _build_query_payload(search_term, reference_date) - - if territory_id: - if isinstance(territory_id, int): territory_id = [territory_id] - for terr_id in territory_id: - payload.append(("territory_ids", terr_id)) + payload = _build_query_payload( + search_term, + is_exact_search, + reference_date, + territory_id, + excerpt_size, + number_of_excerpts + ) req_result = requests.get(self.API_BASE_URL, params=payload) @@ -414,18 +416,39 @@ def parse_result(self, result: dict, result_as_email: bool = True) -> dict: } -def _build_query_payload(search_term: str, reference_date: datetime) -> List[tuple]: - return [ - ("size", 100), - ("excerpt_size", 250), - ("sort_by", "descending_date"), +def _build_query_payload( + search_term: str, + is_exact_search: bool, + reference_date: datetime, + territory_id, + excerpt_size: int = 250, + number_of_excerpts: int = 3 +) -> List[tuple]: + if is_exact_search: + search_term = f'"{search_term}"' + + size = 100 + payload_territory_id = [] + if territory_id: + if isinstance(territory_id, int): territory_id = [territory_id] + for terr_id in territory_id: + payload_territory_id.append(("territory_ids", terr_id)) + # Como a busca é realizada sempre em um única data, + # no resultado haverá no máximo 1 edição por município + size = len(territory_id) + + payload = [ + ("size", size), + ("excerpt_size", excerpt_size), + ("sort_by", "relevance"), ("pre_tags", "<%%>"), ("post_tags", ""), - ("number_of_excerpts", 3), + ("number_of_excerpts", number_of_excerpts), ("published_since", reference_date.strftime("%Y-%m-%d")), ("published_until", reference_date.strftime("%Y-%m-%d")), - ("querystring", f'"{search_term}"'), + ("querystring", search_term), ] + return payload + payload_territory_id class INLABSSearcher(BaseSearcher): diff --git a/tests/parsers_test.py b/tests/parsers_test.py index 5dcf472..105f2a1 100644 --- a/tests/parsers_test.py +++ b/tests/parsers_test.py @@ -605,6 +605,7 @@ "terms": [ "LGPD", "RIO DE JANEIRO", + "DADOS PESSOAIS" ], "header": "Teste com múltiplos territory_id", "sources": ["QD"], @@ -615,12 +616,14 @@ "search_date": "DIA", "field": "TUDO", "is_exact_search": True, - "ignore_signature_match": True, - "force_rematch": True, + "ignore_signature_match": False, + "force_rematch": False, "full_text": False, 
"use_summary": False, "department": None, "pubtype": None, + "number_of_excerpts": 5, + "excerpt_size": 500, } ], "report": { diff --git a/tests/qd_searcher_test.py b/tests/qd_searcher_test.py index 1535aa3..06f19ed 100644 --- a/tests/qd_searcher_test.py +++ b/tests/qd_searcher_test.py @@ -64,12 +64,14 @@ def test_build_query_payload(pre_tags: str, post_tags: str): payload = _build_query_payload( search_term='paralelepípedo', + is_exact_search=True, reference_date=datetime(2023, 2, 9), + territory_id=None, ) expected = [ ('size', 100), ('excerpt_size', 250), - ('sort_by', 'descending_date'), + ('sort_by', 'relevance'), ('pre_tags', pre_tags), ('post_tags', post_tags), ('number_of_excerpts', 3), @@ -80,22 +82,115 @@ def test_build_query_payload(pre_tags: str, assert payload == expected - @pytest.mark.parametrize( 'territory_id, expected_payload', [ - (3300100, [('territory_ids', 3300100)]), - ([3300100, 3300159], [('territory_ids', 3300100), ('territory_ids', 3300159)]), + ( + None, + [ + ('size', 100), + ('excerpt_size', 250), + ('sort_by', 'relevance'), + ('pre_tags', "<%%>"), + ('post_tags', ""), + ('number_of_excerpts', 3), + ('published_since', '2023-02-09'), + ('published_until', '2023-02-09'), + ('querystring', '"paralelepípedo"') + ] + ), + ( + 3303302, + [ + ('size', 1), + ('excerpt_size', 250), + ('sort_by', 'relevance'), + ('pre_tags', "<%%>"), + ('post_tags', ""), + ('number_of_excerpts', 3), + ('published_since', '2023-02-09'), + ('published_until', '2023-02-09'), + ('querystring', '"paralelepípedo"'), + ('territory_ids', 3303302) + ] + ), + ( + [3303302, 3303303], + [ + ('size', 2), + ('excerpt_size', 250), + ('sort_by', 'relevance'), + ('pre_tags', "<%%>"), + ('post_tags', ""), + ('number_of_excerpts', 3), + ('published_since', '2023-02-09'), + ('published_until', '2023-02-09'), + ('querystring', '"paralelepípedo"'), + ('territory_ids', 3303302), + ('territory_ids', 3303303) + ] + ), ] ) -def test_search_with_multiple_territory_ids(territory_id, expected_payload): - #searcher = QDSearcher() - payload = [] +def test_build_query_payload_territory_id_and_size(territory_id, expected_payload): + payload = _build_query_payload( + search_term='paralelepípedo', + is_exact_search=True, + reference_date=datetime(2023, 2, 9), + territory_id=territory_id + ) + + assert payload == expected_payload - # Simula a lógica que foi alterada para suportar múltiplos IDs de território - if isinstance(territory_id, int): - territory_id = [territory_id] - for terr_id in territory_id: - payload.append(('territory_ids', terr_id)) +@pytest.mark.parametrize( + 'excerpt_size, number_of_excerpts, expected_payload', + [ + ( + 500, + 5, + [ + ('size', 100), + ('excerpt_size', 500), + ('sort_by', 'relevance'), + ('pre_tags', "<%%>"), + ('post_tags', ""), + ('number_of_excerpts', 5), + ('published_since', '2023-02-09'), + ('published_until', '2023-02-09'), + ('querystring', '"paralelepípedo"') + ] + ) + ] +) +def test_build_query_payload_excerpt_params(excerpt_size, number_of_excerpts, expected_payload): + payload = _build_query_payload( + search_term='paralelepípedo', + is_exact_search=True, + reference_date=datetime(2023, 2, 9), + territory_id=None, + excerpt_size=excerpt_size, + number_of_excerpts=number_of_excerpts + ) + + assert payload == expected_payload - assert payload == expected_payload \ No newline at end of file +@pytest.mark.parametrize( + 'is_exact_search, expected_search_term', + [ + (True, '"paralelepípedo"'), + (False, 'paralelepípedo') + ] +) +def 
diff --git a/tests/qd_searcher_test.py b/tests/qd_searcher_test.py
index 1535aa3..06f19ed 100644
--- a/tests/qd_searcher_test.py
+++ b/tests/qd_searcher_test.py
@@ -64,12 +64,14 @@ def test_build_query_payload(pre_tags: str,
                              post_tags: str):
     payload = _build_query_payload(
         search_term='paralelepípedo',
+        is_exact_search=True,
         reference_date=datetime(2023, 2, 9),
+        territory_id=None,
     )
 
     expected = [
         ('size', 100),
         ('excerpt_size', 250),
-        ('sort_by', 'descending_date'),
+        ('sort_by', 'relevance'),
         ('pre_tags', pre_tags),
         ('post_tags', post_tags),
         ('number_of_excerpts', 3),
@@ -80,22 +82,115 @@
     assert payload == expected
 
-
 @pytest.mark.parametrize(
     'territory_id, expected_payload',
     [
-        (3300100, [('territory_ids', 3300100)]),
-        ([3300100, 3300159], [('territory_ids', 3300100), ('territory_ids', 3300159)]),
+        (
+            None,
+            [
+                ('size', 100),
+                ('excerpt_size', 250),
+                ('sort_by', 'relevance'),
+                ('pre_tags', "<%%>"),
+                ('post_tags', ""),
+                ('number_of_excerpts', 3),
+                ('published_since', '2023-02-09'),
+                ('published_until', '2023-02-09'),
+                ('querystring', '"paralelepípedo"')
+            ]
+        ),
+        (
+            3303302,
+            [
+                ('size', 1),
+                ('excerpt_size', 250),
+                ('sort_by', 'relevance'),
+                ('pre_tags', "<%%>"),
+                ('post_tags', ""),
+                ('number_of_excerpts', 3),
+                ('published_since', '2023-02-09'),
+                ('published_until', '2023-02-09'),
+                ('querystring', '"paralelepípedo"'),
+                ('territory_ids', 3303302)
+            ]
+        ),
+        (
+            [3303302, 3303303],
+            [
+                ('size', 2),
+                ('excerpt_size', 250),
+                ('sort_by', 'relevance'),
+                ('pre_tags', "<%%>"),
+                ('post_tags', ""),
+                ('number_of_excerpts', 3),
+                ('published_since', '2023-02-09'),
+                ('published_until', '2023-02-09'),
+                ('querystring', '"paralelepípedo"'),
+                ('territory_ids', 3303302),
+                ('territory_ids', 3303303)
+            ]
+        ),
     ]
 )
-def test_search_with_multiple_territory_ids(territory_id, expected_payload):
-    #searcher = QDSearcher()
-    payload = []
+def test_build_query_payload_territory_id_and_size(territory_id, expected_payload):
+    payload = _build_query_payload(
+        search_term='paralelepípedo',
+        is_exact_search=True,
+        reference_date=datetime(2023, 2, 9),
+        territory_id=territory_id
+    )
+
+    assert payload == expected_payload
 
-    # Simula a lógica que foi alterada para suportar múltiplos IDs de território
-    if isinstance(territory_id, int):
-        territory_id = [territory_id]
-    for terr_id in territory_id:
-        payload.append(('territory_ids', terr_id))
+@pytest.mark.parametrize(
+    'excerpt_size, number_of_excerpts, expected_payload',
+    [
+        (
+            500,
+            5,
+            [
+                ('size', 100),
+                ('excerpt_size', 500),
+                ('sort_by', 'relevance'),
+                ('pre_tags', "<%%>"),
+                ('post_tags', ""),
+                ('number_of_excerpts', 5),
+                ('published_since', '2023-02-09'),
+                ('published_until', '2023-02-09'),
+                ('querystring', '"paralelepípedo"')
+            ]
+        )
+    ]
+)
+def test_build_query_payload_excerpt_params(excerpt_size, number_of_excerpts, expected_payload):
+    payload = _build_query_payload(
+        search_term='paralelepípedo',
+        is_exact_search=True,
+        reference_date=datetime(2023, 2, 9),
+        territory_id=None,
+        excerpt_size=excerpt_size,
+        number_of_excerpts=number_of_excerpts
+    )
+
+    assert payload == expected_payload
 
-
-    assert payload == expected_payload
\ No newline at end of file
+@pytest.mark.parametrize(
+    'is_exact_search, expected_search_term',
+    [
+        (True, '"paralelepípedo"'),
+        (False, 'paralelepípedo')
+    ]
+)
+def test_search_with_is_exact_search(is_exact_search, expected_search_term):
+
+    payload = _build_query_payload(
+        search_term='paralelepípedo',
+        is_exact_search=is_exact_search,
+        reference_date=datetime(2023, 2, 9),
+        territory_id=None,
+        excerpt_size=250,
+        number_of_excerpts=3
+    )
+    querystring = payload[-1][1]
+
+    assert querystring == expected_search_term
\ No newline at end of file
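Finally, a hedged sketch of how the reworked QDSearcher.exec_search would be called end to end with the new parameters. The keyword arguments match the signature introduced in src/searchers.py above; the import path, the no-argument constructor and the live HTTP call to the Querido Diário API are assumptions, not something this patch guarantees:

    # Sketch only -- not part of the patch and not executed by the test suite.
    from datetime import datetime

    from searchers import QDSearcher  # assumed import path

    searcher = QDSearcher()  # assumed no-argument constructor
    results = searcher.exec_search(
        territory_id=[3303302, 3303303],
        term_list=["LGPD", "DADOS PESSOAIS"],
        is_exact_search=True,
        reference_date=datetime(2025, 2, 17),  # exec_search queries the previous day
        excerpt_size=500,
        number_of_excerpts=5,
        result_as_email=False,
    )
    # results is expected to group any matches per search term (see exec_search above).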