Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Criada classe de extração para Figuras, Tabelas e Fórmulas #400

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions packtools/sps/models/figures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from packtools.sps.utils import xml_utils


def get_node_without_subtag(node):
    """
    Return the text content of ``node`` with every sub-tag stripped.

    All text fragments matched by the ``.//text()`` XPath query on ``node``
    are concatenated, in the order the query returns them, into one string.
    """
    text_fragments = node.xpath(".//text()")
    return "".join(text_fragments)


class Figure:
    """
    Extract figure metadata (``<fig-group>`` / ``<fig>``) from an XML tree.

    ``xmltree`` is any node exposing an lxml-style ``xpath()`` method.
    """

    # XPath, relative to the element being described, for each text field.
    # The paths are deliberately scoped to the element's own children: a
    # descendant search (``.//label`` / ``.//title``) would also match labels
    # and titles that belong to nested elements.
    _TEXT_FIELD_PATHS = {'label': 'label', 'title': 'caption/title'}

    def __init__(self, xmltree):
        self.xmltree = xmltree

    def extract_figures(self, subtag):
        """
        Return the figures found in the tree.

        ``subtag`` selects the text extractor: truthy uses
        ``xml_utils.node_text_without_xref`` (presumably keeps inline markup
        such as ``<italic>`` -- confirm in ``xml_utils``), falsy uses
        ``get_node_without_subtag`` (plain text only).

        Returns:
            * a list with one dict per ``<fig-group>`` when the tree has any
              ``<fig-group>`` (figures outside a group are then not reported);
            * otherwise a ``{'figs': [...]}`` dict when the tree has ``<fig>``;
            * otherwise the string ``'No figures found.'``.
        """
        extract_node_text = (
            xml_utils.node_text_without_xref if subtag else get_node_without_subtag
        )

        # Evaluate each query once and reuse the result for both the
        # "which shape?" decision and the extraction itself.
        fig_groups = self.xmltree.xpath('.//fig-group')
        if fig_groups:
            return self._extract_figures_with_fig_group(
                node=fig_groups, extract_node_text=extract_node_text
            )

        figs = self.xmltree.xpath('.//fig')
        if figs:
            return self._extract_figures_without_fig_group(
                node=figs, extract_node_text=extract_node_text
            )

        return 'No figures found.'

    @staticmethod
    def _first_text(node, path, extract_node_text):
        """Return the extracted text of the first match of ``path``, or ''."""
        matches = node.xpath(path)
        return extract_node_text(matches[0]) if matches else ''

    def _extract_figures_with_fig_group(self, node, extract_node_text):
        """
        Describe every ``<fig-group>`` in ``node`` (an iterable of nodes).

        Each item is ``{'fig_group_id', 'fig_group_title', 'figs'}``; ``figs``
        lists the group's direct ``<fig>`` children.
        """
        figures = []

        for fig_group_node in node:
            fig_group = {
                'fig_group_id': fig_group_node.get('id', ''),
                # Only the group's own caption title counts. Searching all
                # descendants would return the first child figure's title
                # whenever the group itself has none.
                'fig_group_title': self._first_text(
                    fig_group_node, 'caption/title', extract_node_text
                ),
            }
            fig_group.update(
                self._extract_figures_without_fig_group(
                    node=fig_group_node.xpath('fig'),
                    extract_node_text=extract_node_text,
                )
            )
            figures.append(fig_group)
        return figures

    def _extract_figures_without_fig_group(self, node, extract_node_text):
        """
        Describe every ``<fig>`` in ``node`` (an iterable of nodes).

        Returns ``{'figs': [...]}`` where each item has ``id``, ``label``,
        ``title`` and ``graphic`` (the ``xlink:href`` of the first direct
        ``<graphic>`` child). Missing elements yield ``''``.
        """
        figs = []

        for fig in node:
            data = {'id': fig.get('id', '')}

            for field, path in self._TEXT_FIELD_PATHS.items():
                data[field] = self._first_text(fig, path, extract_node_text)

            graphics = fig.xpath('graphic')
            data['graphic'] = (
                graphics[0].get('{http://www.w3.org/1999/xlink}href')
                if graphics else ''
            )
            figs.append(data)
        return {'figs': figs}


53 changes: 53 additions & 0 deletions packtools/sps/models/formula.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from packtools.sps.utils import xml_utils

class Formula:
    """
    Extract display formulas (``<disp-formula>``) from an XML tree.

    ``xmltree`` is any node exposing an lxml-style ``xpath()`` method.
    """

    def __init__(self, xmltree):
        self.xmltree = xmltree

    @property
    def disp_formula_nodes(self):
        """All ``<disp-formula>`` descendants of the tree."""
        return self.xmltree.xpath('.//disp-formula')

    def get_equation(self, node):
        """
        Describe the equation carried by a ``<disp-formula>`` node.

        Lookup order among the node's direct children: MathML ``<math>``,
        then ``<tex-math>``, then ``<graphic>``. Only the first match is used.

        Returns:
            ``{'id', 'equation'}`` for MathML / TeX, ``{'id', 'graphic'}`` for
            a graphic, or the string ``'Not found formulas'`` when the node
            has none of them.
        """
        mathml_ns = {'mnl': "http://www.w3.org/1998/Math/MathML"}

        markup_nodes = (
            node.xpath('mnl:math', namespaces=mathml_ns)
            or node.xpath('tex-math')
        )
        if markup_nodes:
            first = markup_nodes[0]
            # NOTE(review): the maintainer asked for None, rather than '',
            # when data is absent. Not applied here, to keep the current
            # return values.
            return {
                'id': first.get('id', ''),
                'equation': xml_utils.node_text_without_xref(first),
            }

        graphic_nodes = node.xpath('graphic')
        if graphic_nodes:
            graphic = graphic_nodes[0]
            return {
                'id': graphic.get('id', ''),
                'graphic': graphic.get('{http://www.w3.org/1999/xlink}href'),
            }

        # NOTE(review): the maintainer asked for this to return None instead
        # of a message string. Not applied here, to keep the current contract.
        return 'Not found formulas'

    @property
    def extract_disp_formula(self):
        """
        Describe every ``<disp-formula>`` in the tree.

        Returns ``{'formulas': [...]}``; each item has ``disp_formula_id``,
        ``disp_formula_label`` (``.text`` of the first direct ``<label>``
        child, ``''`` when there is none) and ``equations`` (the result of
        ``get_equation`` for that node).
        """
        collected = []
        for disp_formula in self.disp_formula_nodes:
            labels = disp_formula.xpath('label')
            collected.append({
                'disp_formula_id': disp_formula.get('id', ''),
                'disp_formula_label': labels[0].text if labels else '',
                'equations': self.get_equation(node=disp_formula),
            })
        return {'formulas': collected}
73 changes: 73 additions & 0 deletions packtools/sps/models/tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from packtools.sps.utils import xml_utils


def get_node_without_subtag(node):
    """
    Return the text content of ``node`` with every sub-tag stripped.

    All text fragments matched by the ``.//text()`` XPath query on ``node``
    are concatenated, in the order the query returns them, into one string.
    """
    text_fragments = node.xpath(".//text()")
    return "".join(text_fragments)


class Table:
    """
    Extract table metadata (``<table-wrap-group>`` / ``<table-wrap>``) from
    an XML tree.

    ``xmltree`` is any node exposing an lxml-style ``xpath()`` method.
    """

    # XPath, relative to a <table-wrap>, for each text field. Scoped to the
    # wrapper's own children: a descendant search (``.//label`` /
    # ``.//title``) would also match e.g. a footnote <label> inside
    # <table-wrap-foot> when the table has no label of its own.
    _TEXT_FIELD_PATHS = {'label': 'label', 'title': 'caption/title'}

    def __init__(self, xmltree):
        self.xmltree = xmltree

    def extract_table(self, subtag):
        """
        Return the tables found in the tree.

        ``subtag`` selects the text extractor for label, title and foot:
        truthy uses ``xml_utils.node_text_without_xref`` (presumably keeps
        inline markup -- confirm in ``xml_utils``), falsy uses
        ``get_node_without_subtag`` (plain text only).

        Returns:
            * a list with one dict per ``<table-wrap-group>`` when the tree
              has any (tables outside a group are then not reported);
            * otherwise a ``{'tables': [...]}`` dict when the tree has
              ``<table-wrap>``;
            * otherwise the string ``'No tables found.'``.
        """
        extract_node_text = (
            xml_utils.node_text_without_xref if subtag else get_node_without_subtag
        )

        # Evaluate each query once and reuse the result for both the
        # "which shape?" decision and the extraction itself.
        table_groups = self.xmltree.xpath('.//table-wrap-group')
        if table_groups:
            return self._extract_table_with_table_wrap_group(
                node=table_groups, extract_node_text=extract_node_text
            )

        table_wraps = self.xmltree.xpath('.//table-wrap')
        if table_wraps:
            return self._extract_table_without_table_wrap_group(
                node=table_wraps, extract_node_text=extract_node_text
            )

        return 'No tables found.'

    def _extract_table_with_table_wrap_group(self, node, extract_node_text):
        """
        Describe every ``<table-wrap-group>`` in ``node`` (an iterable).

        Each item is ``{'table_group_id', 'tables'}``; ``tables`` covers all
        ``<table-wrap>`` descendants of the group.
        """
        tables = []

        for group_node in node:
            table_group = {'table_group_id': group_node.get('id', '')}
            table_group.update(
                self._extract_table_without_table_wrap_group(
                    node=group_node.xpath('.//table-wrap'),
                    extract_node_text=extract_node_text,
                )
            )
            tables.append(table_group)
        return tables

    def _extract_table_without_table_wrap_group(self, node, extract_node_text):
        """
        Describe every ``<table-wrap>`` in ``node`` (an iterable of nodes).

        Returns ``{'tables': [...]}`` where each item has ``id``, ``label``,
        ``title``, ``table`` and ``wrap-foot``. Missing elements yield ``''``.
        """
        tables = []

        for table_node in node:
            data = {'id': table_node.get('id', '')}

            for field, path in self._TEXT_FIELD_PATHS.items():
                matches = table_node.xpath(path)
                data[field] = extract_node_text(matches[0]) if matches else ''

            # The table body always goes through node_text_without_xref,
            # whatever extractor was selected for the other fields.
            table_elements = table_node.xpath('table')
            data['table'] = (
                xml_utils.node_text_without_xref(table_elements[0])
                if table_elements else ''
            )

            data.update(
                self.extract_table_wrap_foot(
                    node=table_node, extract_node_text=extract_node_text
                )
            )
            tables.append(data)
        return {'tables': tables}

    def extract_table_wrap_foot(self, node, extract_node_text):
        """
        Return ``{'wrap-foot': text}`` for the first direct
        ``<table-wrap-foot>`` child of ``node`` (``''`` when there is none).
        """
        feet = node.xpath('table-wrap-foot')
        # NOTE(review): the maintainer asked for None when the foot is absent,
        # and for the key to be 'table-wrap-foot' rather than 'wrap-foot'.
        # Neither is applied here, to keep the current return contract.
        foot = extract_node_text(feet[0]) if feet else ''
        return {'wrap-foot': foot}
181 changes: 181 additions & 0 deletions tests/sps/test_figure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
from unittest import TestCase
from packtools.sps.utils import xml_utils


from packtools.sps.models.figures import Figure

from lxml import etree


class FiguresTest(TestCase):
    """Tests for ``Figure.extract_figures`` with and without ``<fig-group>``."""

    def test_extract_with_fig_group(self):
        """
        Grouped figures, ``subtag=False``.

        The result is a list with one dict per ``<fig-group>``; titles are
        expected as plain text, without the inner ``<p>`` markup.
        """
        xml = ("""
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" dtd-version="1.0" article-type="research-article" xml:lang="en">
<front>
<article-meta>
<fig-group id="dogpix4">
<caption><title>Figures 12-14 Bonnie Lassie</title>
<p>Three perspectives on My Dog</p>
</caption>
<fig id="fg-12">
<label>a.</label>
<caption>
<title><p>View A: From the Front, Laughing</p></title>
</caption>
<graphic xlink:href="frontView.png"/>
</fig>
<fig id="fg-13">
<label>b.</label>
<caption>
<title><p>View B: From the Side, Best Profile</p></title>
</caption>
<graphic xlink:href="sideView.png"/>
</fig>
<fig id="fg-14">
<label>c.</label>
<caption>
<title><p>View C: In Motion, A Blur on Feet</p></title>
</caption>
<graphic xlink:href="motionView.png"/>
</fig>
</fig-group>
</article-meta>
</front>
</article>
""")
        xml = etree.fromstring(xml)
        extract = Figure(xml).extract_figures(subtag=False)

        expected_output = [
            {
                'fig_group_id': 'dogpix4',
                'fig_group_title': 'Figures 12-14 Bonnie Lassie',
                'figs': [
                    {
                        'id': 'fg-12',
                        'title': 'View A: From the Front, Laughing',
                        'label': 'a.',
                        'graphic': 'frontView.png'
                    },
                    {
                        'id': 'fg-13',
                        'title': 'View B: From the Side, Best Profile',
                        'label': 'b.',
                        'graphic': 'sideView.png'
                        # NOTE(review): the maintainer pointed out that
                        # <graphic> carries other attributes besides the href
                        # (written "xref:href" in the review, presumably
                        # xlink:href) and that they should be represented in
                        # the dictionary; see figures that contain
                        # <alternatives>.
                    },
                    {
                        'id': 'fg-14',
                        'title': 'View C: In Motion, A Blur on Feet',
                        'label': 'c.',
                        'graphic': 'motionView.png'
                    }
                ]
            }
        ]

        self.assertEqual(extract, expected_output)


    def test_extract_with_fig_group_and_subtag(self):
        """
        Grouped figures, ``subtag=True``.

        Titles are expected to keep their inner markup (``<p>``,
        ``<italic>``).
        """
        xml = ("""
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" dtd-version="1.0" article-type="research-article" xml:lang="en">
<front>
<article-meta>
<fig-group id="dogpix4">
<caption><title>Figures 12-14 Bonnie Lassie</title>
<p>Three perspectives on My Dog</p>
</caption>
<fig id="fg-12">
<label>a.</label>
<caption>
<title><p>View A: From the Front, Laughing</p></title>
</caption>
<graphic xlink:href="frontView.png"/>
</fig>
<fig id="fg-13">
<label>b.</label>
<caption>
<title><p>View B: From the Side, Best Profile</p></title>
</caption>
<graphic xlink:href="sideView.png"/>
</fig>
<fig id="fg-14">
<label>c.</label>
<caption>
<title><p>View C: In <italic>Motion</italic>, A Blur on Feet</p></title>
</caption>
<graphic xlink:href="motionView.png"/>
</fig>
</fig-group>
</article-meta>
</front>
</article>
""")
        xml = etree.fromstring(xml)
        extract = Figure(xml).extract_figures(subtag=True)

        expected_output = [
            {
                'fig_group_id': 'dogpix4',
                'fig_group_title': 'Figures 12-14 Bonnie Lassie',
                'figs': [
                    {
                        'id': 'fg-12',
                        'title': '<p>View A: From the Front, Laughing</p>',
                        'label': 'a.',
                        'graphic': 'frontView.png'
                    },
                    {
                        'id': 'fg-13',
                        'title': '<p>View B: From the Side, Best Profile</p>',
                        'label': 'b.',
                        'graphic': 'sideView.png'
                    },
                    {
                        'id': 'fg-14',
                        'title': '<p>View C: In <italic>Motion</italic>, A Blur on Feet</p>',
                        'label': 'c.',
                        'graphic': 'motionView.png'
                    }
                ]
            }
        ]

        self.assertEqual(extract, expected_output)


    def test_extract_without_fig_group(self):
        """
        Ungrouped figures read from a sample article file, ``subtag=False``.

        The result is a single ``{'figs': [...]}`` dict listing four figures.
        """
        xml= xml_utils.get_xml_tree('tests/samples/0034-8910-rsp-48-2-0206.xml')
        extract = Figure(xml).extract_figures(subtag=False)

        expect_output = {
            'figs': [
                {
                    'id': 'f01',
                    'label': 'Figure 1',
                    'title': 'Graphical representation of the characteristic and information curves of the items selected.',
                    'graphic': '0034-8910-rsp-48-2-0206-gf01'
                },
                {
                    'id': 'f02',
                    'label': 'Figure 2',
                    'title': 'Total Information curve (10 items).',
                    'graphic': '0034-8910-rsp-48-2-0206-gf02'
                },
                {
                    'id': 'f03',
                    'label':
                    'Figure 3',
                    'title': 'Graphical representation of the items and HIV/AIDS knowledge scores.',
                    'graphic': '0034-8910-rsp-48-2-0206-gf03'
                },
                {
                    'id': 'f04',
                    'label': 'Figure 4',
                    'title': 'Items characteristic curves with differential item functioning.',
                    'graphic': '0034-8910-rsp-48-2-0206-gf04'
                }
            ]
        }

        self.assertEqual(extract, expect_output)
Loading