-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Criada classe de extração para Figuras, Tabelas e Fórmulas #400
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
from packtools.sps.utils import xml_utils | ||
|
||
|
||
def get_node_without_subtag(node):
    """Return the text content of ``node`` with all sub-element tags dropped.

    Every text node found at or below ``node`` (XPath ``.//text()``) is
    concatenated in the order the query returns them, so markup is
    discarded and only the character data remains.

    Args:
        node: an element exposing an lxml-style ``xpath`` method.

    Returns:
        str: the concatenated text; ``""`` when there are no text nodes.
    """
    return "".join(node.xpath(".//text()"))
|
||
|
||
class Figure:
    """Extract figure data (``<fig-group>`` / ``<fig>``) from an XML tree.

    Args:
        xmltree: parsed XML element (lxml-style, must provide ``xpath``).
    """

    def __init__(self, xmltree):
        self.xmltree = xmltree

    def extract_figures(self, subtag):
        """Collect the figures found anywhere in the tree.

        Args:
            subtag: when truthy, text is extracted with
                ``xml_utils.node_text_without_xref`` (presumably keeps inline
                markup such as ``<italic>`` - confirm against that helper);
                otherwise ``get_node_without_subtag`` is used and only the
                plain text is kept.

        Returns:
            * ``list[dict]`` - one dict per ``<fig-group>`` when the document
              has at least one group (figures outside any group are not
              reported in that case);
            * ``dict`` of the form ``{'figs': [...]}`` when there are only
              stand-alone ``<fig>`` elements;
            * the string ``'No figures found.'`` when there is neither.
        """
        extract_node_text = (
            xml_utils.node_text_without_xref if subtag else get_node_without_subtag
        )

        # Query each kind once; groups take precedence over loose figures.
        fig_groups = self.xmltree.xpath('.//fig-group')
        if fig_groups:
            return self._extract_figures_with_fig_group(
                node=fig_groups, extract_node_text=extract_node_text
            )

        figs = self.xmltree.xpath('.//fig')
        if figs:
            return self._extract_figures_without_fig_group(
                node=figs, extract_node_text=extract_node_text
            )

        return 'No figures found.'

    def _extract_figures_with_fig_group(self, node, extract_node_text):
        """Build one dict per ``<fig-group>`` element in ``node``.

        Args:
            node: iterable of ``<fig-group>`` elements.
            extract_node_text: callable turning an element into text.

        Returns:
            list[dict]: each with ``fig_group_id``, ``fig_group_title`` and
            ``figs`` (the group's direct ``<fig>`` children).
        """
        figures = []

        for fig_group_node in node:
            # Only the group's own caption title counts. A bare './/title'
            # would fall through to the first child <fig>'s title whenever
            # the group itself has no title.
            group_titles = fig_group_node.xpath('caption/title')
            fig_group = {
                'fig_group_id': fig_group_node.get('id', ''),
                'fig_group_title': (
                    extract_node_text(group_titles[0]) if group_titles else ''
                ),
            }
            fig_group.update(
                self._extract_figures_without_fig_group(
                    node=fig_group_node.xpath('fig'),
                    extract_node_text=extract_node_text,
                )
            )
            figures.append(fig_group)

        return figures

    def _extract_figures_without_fig_group(self, node, extract_node_text):
        """Build the ``{'figs': [...]}`` structure for ``<fig>`` elements.

        Args:
            node: iterable of ``<fig>`` elements.
            extract_node_text: callable turning an element into text.

        Returns:
            dict: ``{'figs': [{'id', 'label', 'title', 'graphic'}, ...]}``.
            Missing label/title/graphic elements yield ``''``; a ``<graphic>``
            without an ``xlink:href`` attribute yields ``None``.
        """
        figures = {'figs': []}

        for fig in node:
            data = {'id': fig.get('id', '')}

            for field in ('label', 'title'):
                found = fig.xpath(f'.//{field}')
                data[field] = extract_node_text(found[0]) if found else ''

            graphics = fig.xpath('graphic')
            data['graphic'] = (
                graphics[0].get('{http://www.w3.org/1999/xlink}href')
                if graphics else ''
            )

            figures['figs'].append(data)

        return figures
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from packtools.sps.utils import xml_utils | ||
|
||
class Formula:
    """Extract display formulas (``<disp-formula>``) from an XML tree.

    Args:
        xmltree: parsed XML element (lxml-style, must provide ``xpath``).
    """

    def __init__(self, xmltree):
        self.xmltree = xmltree

    @property
    def disp_formula_nodes(self):
        """All ``<disp-formula>`` elements found anywhere in the tree."""
        return self.xmltree.xpath('.//disp-formula')

    def get_equation(self, node):
        """Describe the equation carried by one ``<disp-formula>`` element.

        Direct children are checked in this order: MathML ``math``, then
        ``tex-math``, then ``graphic``.

        Args:
            node: a ``<disp-formula>`` element.

        Returns:
            * ``{'id', 'equation'}`` for MathML / TeX content, where the
              equation text comes from ``xml_utils.node_text_without_xref``;
            * ``{'id', 'graphic'}`` for an image equation, ``graphic`` being
              the ``xlink:href`` value (``None`` if the attribute is absent);
            * the string ``'Not found formulas'`` when none is present.
        """
        mnl_namespace = {'mnl': "http://www.w3.org/1998/Math/MathML"}

        # Run each query once and reuse the result.
        eq_nodes = (
            node.xpath('mnl:math', namespaces=mnl_namespace)
            or node.xpath('tex-math')
        )
        if eq_nodes:
            eq_node = eq_nodes[0]
            return {
                'id': eq_node.get('id', ''),
                'equation': xml_utils.node_text_without_xref(eq_node),
            }

        graphic_nodes = node.xpath('graphic')
        if graphic_nodes:
            graphic_node = graphic_nodes[0]
            return {
                'id': graphic_node.get('id', ''),
                'graphic': graphic_node.get('{http://www.w3.org/1999/xlink}href'),
            }

        # NOTE(review): the PR reviewer asked for ``None`` here ("retornar
        # None"); the sentinel string is kept so the returned value does not
        # change for existing callers.
        return 'Not found formulas'

    @property
    def extract_disp_formula(self):
        """Return every display formula with its id, label and equation.

        Returns:
            dict: ``{'formulas': [{'disp_formula_id', 'disp_formula_label',
            'equations'}, ...]}`` where ``equations`` is whatever
            ``get_equation`` returns for that element.
        """
        formulas = {'formulas': []}

        for disp_node in self.disp_formula_nodes:
            labels = disp_node.xpath('label')
            # Join all text nodes rather than reading ``.text``: the latter
            # is None for an empty label and drops text nested in inline
            # elements such as <bold>.
            disp_node_label = (
                "".join(labels[0].xpath('.//text()')) if labels else ''
            )

            formulas['formulas'].append({
                'disp_formula_id': disp_node.get('id', ''),
                'disp_formula_label': disp_node_label,
                'equations': self.get_equation(node=disp_node),
            })

        return formulas
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
from packtools.sps.utils import xml_utils | ||
|
||
|
||
def get_node_without_subtag(node):
    """Return the text content of ``node`` with all sub-element tags dropped.

    Every text node found at or below ``node`` (XPath ``.//text()``) is
    concatenated in the order the query returns them, so markup is
    discarded and only the character data remains.

    Args:
        node: an element exposing an lxml-style ``xpath`` method.

    Returns:
        str: the concatenated text; ``""`` when there are no text nodes.
    """
    return "".join(node.xpath(".//text()"))
|
||
|
||
class Table:
    """Extract table data (``<table-wrap-group>`` / ``<table-wrap>``).

    Args:
        xmltree: parsed XML element (lxml-style, must provide ``xpath``).
    """

    def __init__(self, xmltree):
        self.xmltree = xmltree

    def extract_table(self, subtag):
        """Collect the tables found anywhere in the tree.

        Args:
            subtag: when truthy, label/title/foot text is extracted with
                ``xml_utils.node_text_without_xref`` (presumably keeps inline
                markup - confirm against that helper); otherwise
                ``get_node_without_subtag`` is used and only plain text is
                kept.

        Returns:
            * ``list[dict]`` - one dict per ``<table-wrap-group>`` when the
              document has at least one group (table-wraps outside any group
              are not reported in that case);
            * ``dict`` of the form ``{'tables': [...]}`` when there are only
              stand-alone ``<table-wrap>`` elements;
            * the string ``'No tables found.'`` when there is neither.
        """
        extract_node_text = (
            xml_utils.node_text_without_xref if subtag else get_node_without_subtag
        )

        # Query each kind once; groups take precedence over loose tables.
        table_groups = self.xmltree.xpath('.//table-wrap-group')
        if table_groups:
            return self._extract_table_with_table_wrap_group(
                node=table_groups, extract_node_text=extract_node_text
            )

        table_wraps = self.xmltree.xpath('.//table-wrap')
        if table_wraps:
            return self._extract_table_without_table_wrap_group(
                node=table_wraps, extract_node_text=extract_node_text
            )

        return 'No tables found.'

    def _extract_table_with_table_wrap_group(self, node, extract_node_text):
        """Build one dict per ``<table-wrap-group>`` element in ``node``.

        Returns:
            list[dict]: each with ``table_group_id`` and ``tables`` (every
            ``<table-wrap>`` found below the group).
        """
        tables = []

        for table_node in node:
            table_group = {'table_group_id': table_node.get('id', '')}
            table_group.update(
                self._extract_table_without_table_wrap_group(
                    node=table_node.xpath('.//table-wrap'),
                    extract_node_text=extract_node_text,
                )
            )
            tables.append(table_group)

        return tables

    def _extract_table_without_table_wrap_group(self, node, extract_node_text):
        """Build the ``{'tables': [...]}`` structure for ``<table-wrap>`` nodes.

        Args:
            node: iterable of ``<table-wrap>`` elements.
            extract_node_text: callable turning an element into text.

        Returns:
            dict: ``{'tables': [{'id', 'label', 'title', 'table',
            'wrap-foot'}, ...]}``; absent parts are ``''``.
        """
        tables = {'tables': []}
        # Look only at the table-wrap's own label and caption title. The
        # descendant axis ('.//label', './/title') would also match labels
        # and titles of footnotes inside <table-wrap-foot> and report them
        # as the table's whenever the table has none of its own.
        field_paths = {'label': 'label', 'title': 'caption/title'}

        for table_node in node:
            data = {'id': table_node.get('id', '')}

            for field, path in field_paths.items():
                found = table_node.xpath(path)
                data[field] = extract_node_text(found[0]) if found else ''

            # The table body always goes through node_text_without_xref,
            # independent of the ``subtag`` choice made by the caller.
            table_elements = table_node.xpath('table')
            data['table'] = (
                xml_utils.node_text_without_xref(table_elements[0])
                if table_elements else ''
            )

            data.update(
                self.extract_table_wrap_foot(
                    node=table_node, extract_node_text=extract_node_text
                )
            )
            tables['tables'].append(data)

        return tables

    def extract_table_wrap_foot(self, node, extract_node_text):
        """Return the text of the table-wrap's direct ``<table-wrap-foot>``.

        Args:
            node: a ``<table-wrap>`` element.
            extract_node_text: callable turning an element into text.

        Returns:
            dict: ``{'wrap-foot': <text>}``; the text is ``''`` when the
            element is absent.
        """
        feet = node.xpath('table-wrap-foot')
        # NOTE(review): the PR reviewer asked for ``None`` when the foot is
        # absent ("se ausente retornar None"); '' is kept so the returned
        # value does not change for existing callers.
        return {'wrap-foot': extract_node_text(feet[0]) if feet else ''}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
from unittest import TestCase | ||
from packtools.sps.utils import xml_utils | ||
|
||
|
||
from packtools.sps.models.figures import Figure | ||
|
||
from lxml import etree | ||
|
||
|
||
class FiguresTest(TestCase):
    """Tests for ``Figure.extract_figures`` with and without ``<fig-group>``."""

    def test_extract_with_fig_group(self):
        """Grouped figures, plain-text extraction (``subtag=False``)."""
        xml = """
        <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" dtd-version="1.0" article-type="research-article" xml:lang="en">
        <front>
        <article-meta>
        <fig-group id="dogpix4">
        <caption><title>Figures 12-14 Bonnie Lassie</title>
        <p>Three perspectives on My Dog</p>
        </caption>
        <fig id="fg-12">
        <label>a.</label>
        <caption>
        <title><p>View A: From the Front, Laughing</p></title>
        </caption>
        <graphic xlink:href="frontView.png"/>
        </fig>
        <fig id="fg-13">
        <label>b.</label>
        <caption>
        <title><p>View B: From the Side, Best Profile</p></title>
        </caption>
        <graphic xlink:href="sideView.png"/>
        </fig>
        <fig id="fg-14">
        <label>c.</label>
        <caption>
        <title><p>View C: In Motion, A Blur on Feet</p></title>
        </caption>
        <graphic xlink:href="motionView.png"/>
        </fig>
        </fig-group>
        </article-meta>
        </front>
        </article>
        """
        xml = etree.fromstring(xml)
        extract = Figure(xml).extract_figures(subtag=False)

        expected_output = [
            {
                'fig_group_id': 'dogpix4',
                'fig_group_title': 'Figures 12-14 Bonnie Lassie',
                'figs': [
                    {
                        'id': 'fg-12',
                        'title': 'View A: From the Front, Laughing',
                        'label': 'a.',
                        'graphic': 'frontView.png'
                    },
                    {
                        'id': 'fg-13',
                        'title': 'View B: From the Side, Best Profile',
                        'label': 'b.',
                        'graphic': 'sideView.png'
                    },
                    {
                        'id': 'fg-14',
                        'title': 'View C: In Motion, A Blur on Feet',
                        'label': 'c.',
                        'graphic': 'motionView.png'
                    }
                ]
            }
        ]

        self.assertEqual(extract, expected_output)

    def test_extract_with_fig_group_and_subtag(self):
        """Grouped figures with ``subtag=True``: inline markup is expected."""
        xml = """
        <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" dtd-version="1.0" article-type="research-article" xml:lang="en">
        <front>
        <article-meta>
        <fig-group id="dogpix4">
        <caption><title>Figures 12-14 Bonnie Lassie</title>
        <p>Three perspectives on My Dog</p>
        </caption>
        <fig id="fg-12">
        <label>a.</label>
        <caption>
        <title><p>View A: From the Front, Laughing</p></title>
        </caption>
        <graphic xlink:href="frontView.png"/>
        </fig>
        <fig id="fg-13">
        <label>b.</label>
        <caption>
        <title><p>View B: From the Side, Best Profile</p></title>
        </caption>
        <graphic xlink:href="sideView.png"/>
        </fig>
        <fig id="fg-14">
        <label>c.</label>
        <caption>
        <title><p>View C: In <italic>Motion</italic>, A Blur on Feet</p></title>
        </caption>
        <graphic xlink:href="motionView.png"/>
        </fig>
        </fig-group>
        </article-meta>
        </front>
        </article>
        """
        xml = etree.fromstring(xml)
        extract = Figure(xml).extract_figures(subtag=True)

        expected_output = [
            {
                'fig_group_id': 'dogpix4',
                'fig_group_title': 'Figures 12-14 Bonnie Lassie',
                'figs': [
                    {
                        'id': 'fg-12',
                        'title': '<p>View A: From the Front, Laughing</p>',
                        'label': 'a.',
                        'graphic': 'frontView.png'
                    },
                    {
                        'id': 'fg-13',
                        'title': '<p>View B: From the Side, Best Profile</p>',
                        'label': 'b.',
                        'graphic': 'sideView.png'
                    },
                    {
                        'id': 'fg-14',
                        'title': '<p>View C: In <italic>Motion</italic>, A Blur on Feet</p>',
                        'label': 'c.',
                        'graphic': 'motionView.png'
                    }
                ]
            }
        ]

        self.assertEqual(extract, expected_output)

    def test_extract_without_fig_group(self):
        """Stand-alone figures read from a sample article file."""
        # Path is relative to the current working directory of the test run.
        xml = xml_utils.get_xml_tree('tests/samples/0034-8910-rsp-48-2-0206.xml')
        extract = Figure(xml).extract_figures(subtag=False)

        expected_output = {
            'figs': [
                {
                    'id': 'f01',
                    'label': 'Figure 1',
                    'title': 'Graphical representation of the characteristic and information curves of the items selected.',
                    'graphic': '0034-8910-rsp-48-2-0206-gf01'
                },
                {
                    'id': 'f02',
                    'label': 'Figure 2',
                    'title': 'Total Information curve (10 items).',
                    'graphic': '0034-8910-rsp-48-2-0206-gf02'
                },
                {
                    'id': 'f03',
                    'label': 'Figure 3',
                    'title': 'Graphical representation of the items and HIV/AIDS knowledge scores.',
                    'graphic': '0034-8910-rsp-48-2-0206-gf03'
                },
                {
                    'id': 'f04',
                    'label': 'Figure 4',
                    'title': 'Items characteristic curves with differential item functioning.',
                    'graphic': '0034-8910-rsp-48-2-0206-gf04'
                }
            ]
        }

        self.assertEqual(extract, expected_output)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@samuelveigarangel na ausência de dados, usar `None`