# File: packtools/sps/models/figures.py (new file in this patch)
from packtools.sps.utils import xml_utils


def get_node_without_subtag(node):
    """Return the text content of ``node`` with all descendant tags removed.

    Every text node found under ``node`` is concatenated in document order.
    """
    return "".join(node.xpath(".//text()"))


class Figure:
    """Extract ``<fig>`` and ``<fig-group>`` data from an XML tree."""

    # Clark-notation name of the xlink:href attribute carried by <graphic>.
    _XLINK_HREF = '{http://www.w3.org/1999/xlink}href'

    def __init__(self, xmltree):
        # Object exposing ``xpath`` (an lxml tree/element in the tests).
        self.xmltree = xmltree

    def extract_figures(self, subtag):
        """Return the figures found in the tree.

        Args:
            subtag: when truthy, texts are extracted with
                ``xml_utils.node_text_without_xref`` (presumably keeping
                inline markup -- confirm against xml_utils); otherwise tags
                are stripped with ``get_node_without_subtag``.

        Returns:
            * a list with one dict per ``<fig-group>`` when the tree has any
              fig-group (figures outside a group are not reported then);
            * otherwise a dict ``{'figs': [...]}`` built from every ``<fig>``;
            * the string ``'No figures found.'`` when neither is present.
        """
        extract_node_text = (
            xml_utils.node_text_without_xref if subtag else get_node_without_subtag
        )

        # Each query is evaluated once and the result reused.
        fig_groups = self.xmltree.xpath('.//fig-group')
        if fig_groups:
            return self._extract_figures_with_fig_group(
                node=fig_groups, extract_node_text=extract_node_text)

        figs = self.xmltree.xpath('.//fig')
        if figs:
            return self._extract_figures_without_fig_group(
                node=figs, extract_node_text=extract_node_text)

        return 'No figures found.'

    def _extract_figures_with_fig_group(self, node, extract_node_text):
        """Build one dict per fig-group in ``node`` (an iterable of elements).

        Each dict holds ``fig_group_id``, ``fig_group_title`` and the ``figs``
        list produced for the group's direct ``<fig>`` children.
        """
        figures = []

        for fig_group_node in node:
            # Only titles that do not belong to a nested <fig> can be the
            # group's own title; a plain './/title' would pick up the first
            # figure's title when the group itself has none.
            group_titles = fig_group_node.xpath('.//title[not(ancestor::fig)]')
            fig_group = {
                'fig_group_id': fig_group_node.get('id', ''),
                'fig_group_title': (
                    extract_node_text(group_titles[0]) if group_titles else ''
                ),
            }
            fig_group.update(
                self._extract_figures_without_fig_group(
                    node=fig_group_node.xpath('fig'),
                    extract_node_text=extract_node_text,
                )
            )
            figures.append(fig_group)
        return figures

    def _extract_figures_without_fig_group(self, node, extract_node_text):
        """Return ``{'figs': [...]}`` with id, label, title and graphic per fig.

        Missing elements or attributes are reported as empty strings.
        """
        figures = {'figs': []}

        for fig in node:
            data = {'id': fig.get('id', '')}

            for field in ('label', 'title'):
                found = fig.xpath(f'.//{field}')
                data[field] = extract_node_text(found[0]) if found else ''

            graphics = fig.xpath('graphic')
            # A <graphic> without xlink:href yields '' just like a missing
            # <graphic>, instead of None.
            data['graphic'] = (
                graphics[0].get(self._XLINK_HREF, '') if graphics else ''
            )

            figures['figs'].append(data)
        return figures


# File: packtools/sps/models/formula.py (new file in this patch)
from packtools.sps.utils import xml_utils


class Formula:
    """Extract ``<disp-formula>`` data from an XML tree."""

    def __init__(self, xmltree):
        # Object exposing ``xpath`` (an lxml tree/element in the tests).
        self.xmltree = xmltree

    @property
    def disp_formula_nodes(self):
        """All ``<disp-formula>`` elements found under the tree."""
        return self.xmltree.xpath('.//disp-formula')

    def get_equation(self, node):
        """Describe the equation carried by a ``<disp-formula>`` element.

        Direct children are checked in this order: MathML ``math``,
        ``tex-math``, then ``graphic``; only the first match is used.

        Returns:
            ``{'id': ..., 'equation': ...}`` for math / tex-math,
            ``{'id': ..., 'graphic': ...}`` for a graphic, or the string
            ``'Not found formulas'`` when none of them is present.
        """
        mnl_namespace = {'mnl': "http://www.w3.org/1998/Math/MathML"}

        # Evaluate the queries once; fall back to tex-math when there is
        # no MathML child.
        eq_nodes = (
            node.xpath('mnl:math', namespaces=mnl_namespace)
            or node.xpath('tex-math')
        )
        if eq_nodes:
            eq_node = eq_nodes[0]
            return {
                'id': eq_node.get('id', ''),
                'equation': xml_utils.node_text_without_xref(eq_node),
            }

        graphics = node.xpath('graphic')
        if graphics:
            graphic = graphics[0]
            return {
                'id': graphic.get('id', ''),
                'graphic': graphic.get('{http://www.w3.org/1999/xlink}href'),
            }

        return 'Not found formulas'

    @property
    def extract_disp_formula(self):
        """Return ``{'formulas': [...]}`` with one entry per disp-formula.

        Each entry holds ``disp_formula_id``, ``disp_formula_label`` and
        ``equations`` (the value returned by ``get_equation``).
        """
        formulas = {'formulas': []}

        for disp_node in self.disp_formula_nodes:
            labels = disp_node.xpath('label')
            # Join every text node: ``.text`` alone is None for an empty
            # label and misses text wrapped in inline markup.
            label = "".join(labels[0].xpath('.//text()')) if labels else ''

            formulas['formulas'].append({
                'disp_formula_id': disp_node.get('id', ''),
                'disp_formula_label': label,
                'equations': self.get_equation(node=disp_node),
            })
        return formulas


# File: packtools/sps/models/tables.py (new file in this patch; the rest of
# its patch header and its content follow on the next source line)
000000000..93821aa57 --- /dev/null +++ b/packtools/sps/models/tables.py @@ -0,0 +1,73 @@ +from packtools.sps.utils import xml_utils + + +def get_node_without_subtag(node): + """ + Função que retorna nó sem subtags. + """ + return "".join(node.xpath(".//text()")) + + +class Table: + def __init__(self, xmltree): + self.xmltree = xmltree + + + def extract_table(self, subtag): + table_node = self.xmltree.xpath('.//table-wrap-group') or self.xmltree.xpath('.//table-wrap') + extract_node_text = xml_utils.node_text_without_xref if subtag else get_node_without_subtag + + if table_node: + if self.xmltree.xpath('.//table-wrap-group'): + return self._extract_table_with_table_wrap_group(node=table_node, extract_node_text=extract_node_text) + else: + return self._extract_table_without_table_wrap_group(node=table_node, extract_node_text=extract_node_text) + else: + return 'No tables found.' + + + def _extract_table_with_table_wrap_group(self, node, extract_node_text): + tables = [] + + for table_node in node: + table_group_id = table_node.get('id', '') + table_group = {'table_group_id': table_group_id} + data = self._extract_table_without_table_wrap_group(node=table_node.xpath('.//table-wrap'), extract_node_text=extract_node_text) + table_group.update(data) + tables.append(table_group) + return tables + + + def _extract_table_without_table_wrap_group(self, node, extract_node_text): + tables = {'tables': []} + data_tables = ['label', 'title'] + + for table_node in node: + table_id = table_node.get('id', '') + data = {'id': table_id} + + for field in data_tables: + try: + data[field] = extract_node_text(table_node.xpath(f'.//{field}')[0]) + except IndexError: + data[field] = '' + + try: + data['table'] = xml_utils.node_text_without_xref(table_node.xpath('table')[0]) + except IndexError: + data['table'] = '' + + foot = self.extract_table_wrap_foot(node=table_node, extract_node_text=extract_node_text) + + data.update(foot) + tables['tables'].append(data) + return tables + + + def 
extract_table_wrap_foot(self, node, extract_node_text): + try: + foot = extract_node_text(node.xpath('table-wrap-foot')[0]) + except IndexError: + foot = '' + foot = {'wrap-foot': foot} + return foot \ No newline at end of file diff --git a/tests/sps/test_figure.py b/tests/sps/test_figure.py new file mode 100644 index 000000000..4c39fdce7 --- /dev/null +++ b/tests/sps/test_figure.py @@ -0,0 +1,181 @@ +from unittest import TestCase +from packtools.sps.utils import xml_utils + + +from packtools.sps.models.figures import Figure + +from lxml import etree + + +class FiguresTest(TestCase): + def test_extract_with_fig_group(self): + xml = (""" +
+ + + + Figures 12-14 Bonnie Lassie +

Three perspectives on My Dog

+ + + + + <p>View A: From the Front, Laughing</p> + + + + + + + <p>View B: From the Side, Best Profile</p> + + + + + + + <p>View C: In Motion, A Blur on Feet</p> + + + +
+
+
+
+ """) + xml = etree.fromstring(xml) + extract = Figure(xml).extract_figures(subtag=False) + + expected_output = [ + { + 'fig_group_id': 'dogpix4', + 'fig_group_title': 'Figures 12-14 Bonnie Lassie', + 'figs': [ + { + 'id': 'fg-12', + 'title': 'View A: From the Front, Laughing', + 'label': 'a.', + 'graphic': 'frontView.png' + }, + { + 'id': 'fg-13', + 'title': 'View B: From the Side, Best Profile', + 'label': 'b.', + 'graphic': 'sideView.png' + }, + { + 'id': 'fg-14', + 'title': 'View C: In Motion, A Blur on Feet', + 'label': 'c.', + 'graphic': 'motionView.png' + } + ] + } + ] + + self.assertEqual(extract, expected_output) + + + def test_extract_with_fig_group_and_subtag(self): + xml = (""" +
+ + + + Figures 12-14 Bonnie Lassie +

Three perspectives on My Dog

+ + + + + <p>View A: From the Front, Laughing</p> + + + + + + + <p>View B: From the Side, Best Profile</p> + + + + + + + <p>View C: In <italic>Motion</italic>, A Blur on Feet</p> + + + +
+
+
+
+ """) + xml = etree.fromstring(xml) + extract = Figure(xml).extract_figures(subtag=True) + + expected_output = [ + { + 'fig_group_id': 'dogpix4', + 'fig_group_title': 'Figures 12-14 Bonnie Lassie', + 'figs': [ + { + 'id': 'fg-12', + 'title': '

View A: From the Front, Laughing

', + 'label': 'a.', + 'graphic': 'frontView.png' + }, + { + 'id': 'fg-13', + 'title': '

View B: From the Side, Best Profile

', + 'label': 'b.', + 'graphic': 'sideView.png' + }, + { + 'id': 'fg-14', + 'title': '

View C: In Motion, A Blur on Feet

', + 'label': 'c.', + 'graphic': 'motionView.png' + } + ] + } + ] + + self.assertEqual(extract, expected_output) + + + def test_extract_without_fig_group(self): + xml= xml_utils.get_xml_tree('tests/samples/0034-8910-rsp-48-2-0206.xml') + extract = Figure(xml).extract_figures(subtag=False) + + expect_output = { + 'figs': [ + { + 'id': 'f01', + 'label': 'Figure 1', + 'title': 'Graphical representation of the characteristic and information curves of the items selected.', + 'graphic': '0034-8910-rsp-48-2-0206-gf01' + }, + { + 'id': 'f02', + 'label': 'Figure 2', + 'title': 'Total Information curve (10 items).', + 'graphic': '0034-8910-rsp-48-2-0206-gf02' + }, + { + 'id': 'f03', + 'label': + 'Figure 3', + 'title': 'Graphical representation of the items and HIV/AIDS knowledge scores.', + 'graphic': '0034-8910-rsp-48-2-0206-gf03' + }, + { + 'id': 'f04', + 'label': 'Figure 4', + 'title': 'Items characteristic curves with differential item functioning.', + 'graphic': '0034-8910-rsp-48-2-0206-gf04' + } + ] + } + + self.assertEqual(extract, expect_output) \ No newline at end of file diff --git a/tests/sps/test_formula.py b/tests/sps/test_formula.py new file mode 100644 index 000000000..06b617121 --- /dev/null +++ b/tests/sps/test_formula.py @@ -0,0 +1,95 @@ +from unittest import TestCase +from packtools.sps.utils import xml_utils + + +from packtools.sps.models.formula import Formula +from lxml import etree + + +class FormulasTest(TestCase): + def test_formula(self): + xml = (r""" +
+ + + + + + + q + c + + = + h + + ( + + T + + + T + 0 + + + ) + + + + + + + + + \documentclass {article} + \usepackage{wasysym} + \usepackage[substack]{amsmath} + \usepackage{amsfonts} + \usepackage{amssymb} + \usepackage{amsbsy} + \usepackage[mathscr]{eucal} + \usepackage{mathrsfs} + \usepackage{pmc} + \usepackage[Euler]{upgreek} + \pagestyle{empty} + \oddsidemargin -1.0in + \begin{document} + \[E_it=α_i+Z_it γ+W_it δ+C_it θ+∑_i^n EFind_i+∑_t^n EFtemp_t+ ε_it \] + \end{document} + + + + + + + +
+ """) + + xml = xml_utils.get_xml_tree(xml) + extract = Formula(xml).extract_disp_formula + + expected_output = { + 'formulas': [ + { + 'disp_formula_id': 'e3', + 'disp_formula_label': '(3)', + 'equation': { + 'id': 'm1', + 'eq': 'qc=h(TT0)'} + }, + { + 'disp_formula_id': 'e10', + 'disp_formula_label': '(1)', + 'equation': { + 'id': 'tx1', + 'eq': '\n \\documentclass {article}\n \\usepackage{wasysym}\n \\usepackage[substack]{amsmath}\n \\usepackage{amsfonts}\n \\usepackage{amssymb}\n \\usepackage{amsbsy}\n \\usepackage[mathscr]{eucal}\n \\usepackage{mathrsfs}\n \\usepackage{pmc}\n \\usepackage[Euler]{upgreek}\n \\pagestyle{empty}\n \\oddsidemargin -1.0in\n \\begin{document}\n \\[E_it=α_i+Z_it γ+W_it δ+C_it θ+∑_i^n EFind_i+∑_t^n EFtemp_t+ ε_it \\]\n \\end{document}\n '} + }, + { + 'disp_formula_id': 'e1', + 'disp_formula_label': '', + 'equation': { + 'id': '', + 'graphic': '1234-5678-rctb-45-05-0110-e01.tif'} + } + ] + } \ No newline at end of file diff --git a/tests/sps/test_table.py b/tests/sps/test_table.py new file mode 100644 index 000000000..cbcc2eb4f --- /dev/null +++ b/tests/sps/test_table.py @@ -0,0 +1,203 @@ +from unittest import TestCase +from packtools.sps.utils import xml_utils + +from lxml import etree + +from packtools.sps.models.tables import Table + + +class TablesTest(TestCase): + def test_extract_tables_with_group(self): + xml = (""" +
+ + + + + + + Proportion of correct HIV/AIDS knowledge responses as reported by the men who have sex with men, the difficulty and discrimination parameters for each item, estimated by Item Response Theory. Brazil, 2008-2009. (N = 3,746) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Item% Correct responseDifficulty (b + + i + )Discrimination (a + + i + )
1. The risk of transmitting HIV is small if one follows the treatment correctly.35.514.450.04
2. People are using less condoms because of AIDS treatment.34.514.230.04
3. A person can get the AIDS virus by using public toilets.78.13.720.95
4. A person can get the AIDS virus through insect bites.75.53.700.72
5. A person can become infected by sharing eating utensils, cups or food.85.73.281.01
6. The risk of HIV + mothers infecting their babies is small if she receives treatment in pregnancy and childbirth.75.82.700.32
7. The risk of HIV infection can be reduced if you have relations only with an uninfected partner.72.62.030.20
8. A healthy person can be infected with the AIDS virus.94.11.610.59
9. A person can get the virus from sharing a syringe or needle.96.91.510.78
10. Anyone can get the AIDS virus if condoms are not used.98.51.270.95
+ +

+ b + + i + : Difficulty parameter of each item; a + + i + : Discrimination parameter of each item

+
+
+
+
+
+
+ """) + + xml = xml_utils.get_xml_tree(xml) + extract = Table(xml).extract_table(subtag=False) + expected_output = [ + { + 'table_group_id': 't01', + 'tables': [ + { + 'id': 't01', + 'label': 'Table. ', + 'title': 'Proportion of correct HIV/AIDS knowledge responses as reported by the men who have sex with men, the difficulty and discrimination parameters for each item, estimated by Item Response Theory. Brazil, 2008-2009. (N = 3,746)', + 'table': 'Item% Correct responseDifficulty (b\n i)Discrimination (a\n i)1. The risk of transmitting HIV is small if one follows the treatment correctly.35.514.450.042. People are using less condoms because of AIDS treatment.34.514.230.043. A person can get the AIDS virus by using public toilets.78.13.720.954. A person can get the AIDS virus through insect bites.75.53.700.725. A person can become infected by sharing eating utensils, cups or food.85.73.281.016. The risk of HIV + mothers infecting their babies is small if she receives treatment in pregnancy and childbirth.75.82.700.327. The risk of HIV infection can be reduced if you have relations only with an uninfected partner.72.62.030.208. A healthy person can be infected with the AIDS virus.94.11.610.599. A person can get the virus from sharing a syringe or needle.96.91.510.7810. Anyone can get the AIDS virus if condoms are not used.98.51.270.95', 'wrap-foot': 'bi: Difficulty parameter of each item; ai: Discrimination parameter of each item' + } + ] + } + ] + + self.assertEqual(extract, expected_output) + + def test_extract_tables_without_group(self): + xml =(""" +
+ + + + + + Alíquota menor para prestadores + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Proposta de Novas Tabelas - 2016
Receita Bruta em 12 Meses - em R$Anexo I - ComércioAnexo II Indústria
De R$ 225.000,01 a RS 450.000,004,00%4,50%
De R$ 450.000,01 a R$ 900.000,008,25%8,00%
De R$ 900.000,01 a R$ 1.800.000,0011,25%12,25%
+ + +

A informação de alíquota do anexo II é significativa

+
+
+
+
+
+
+ """) + xml = xml_utils.get_xml_tree(xml) + extract = Table(xml).extract_table(subtag=False) + + expected_output = { + 'tables': [ + { + 'id': 't5', + 'label': 'Tabela 5', + 'title': 'Alíquota menor para prestadores', + 'table': 'Proposta de Novas Tabelas - 2016Receita Bruta em 12 Meses - em R$Anexo I - ComércioAnexo II IndústriaDe R$ 225.000,01 a RS 450.000,004,00%4,50%De R$ 450.000,01 a R$ 900.000,008,25%8,00%De R$ 900.000,01 a R$ 1.800.000,0011,25%12,25%', 'wrap-foot': 'A informação de alíquota do anexo II é significativa' + } + ] + } + + self.assertEqual(extract, expected_output) \ No newline at end of file